diff --git a/.venv/Lib/site-packages/torch/nn/__pycache__/__init__.cpython-39.pyc b/.venv/Lib/site-packages/torch/nn/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..06f32fadd20194d78225a850b8bf64b7605f229e Binary files /dev/null and b/.venv/Lib/site-packages/torch/nn/__pycache__/__init__.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/torch/nn/__pycache__/_reduction.cpython-39.pyc b/.venv/Lib/site-packages/torch/nn/__pycache__/_reduction.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6184f84c3ea3d240693a34ab171fa1b77bacab22 Binary files /dev/null and b/.venv/Lib/site-packages/torch/nn/__pycache__/_reduction.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/torch/nn/__pycache__/common_types.cpython-39.pyc b/.venv/Lib/site-packages/torch/nn/__pycache__/common_types.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9b1ef7caaee12a9665af4297fdfbbc4f13758093 Binary files /dev/null and b/.venv/Lib/site-packages/torch/nn/__pycache__/common_types.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/torch/nn/__pycache__/functional.cpython-39.pyc b/.venv/Lib/site-packages/torch/nn/__pycache__/functional.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6cc7eb616a3b94ef06736bf9f9898ca2f7c77e10 Binary files /dev/null and b/.venv/Lib/site-packages/torch/nn/__pycache__/functional.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/torch/nn/__pycache__/grad.cpython-39.pyc b/.venv/Lib/site-packages/torch/nn/__pycache__/grad.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..934df3c8c799e77d3016c7cbf6eff2199fd40dfd Binary files /dev/null and b/.venv/Lib/site-packages/torch/nn/__pycache__/grad.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/torch/nn/__pycache__/init.cpython-39.pyc b/.venv/Lib/site-packages/torch/nn/__pycache__/init.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3e26ca76c72bdb2addf1bca93a706309b2a2df3 Binary files /dev/null and b/.venv/Lib/site-packages/torch/nn/__pycache__/init.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/torch/nn/__pycache__/parameter.cpython-39.pyc b/.venv/Lib/site-packages/torch/nn/__pycache__/parameter.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b094a410bff6c4e9ddfa64507f6968a2939f5799 Binary files /dev/null and b/.venv/Lib/site-packages/torch/nn/__pycache__/parameter.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/torch/nn/quantized/__pycache__/__init__.cpython-39.pyc b/.venv/Lib/site-packages/torch/nn/quantized/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7461bf003888fe62c6c97b8e414f278a979d7bf6 Binary files /dev/null and b/.venv/Lib/site-packages/torch/nn/quantized/__pycache__/__init__.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/torch/nn/quantized/__pycache__/functional.cpython-39.pyc b/.venv/Lib/site-packages/torch/nn/quantized/__pycache__/functional.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8c43a16fa79e58d30789ce1865306ec462c064e1 Binary files /dev/null and b/.venv/Lib/site-packages/torch/nn/quantized/__pycache__/functional.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/torch/nn/quantized/_reference/__init__.py b/.venv/Lib/site-packages/torch/nn/quantized/_reference/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..b8e21237ee25e376260e342f766121de7baedaed --- /dev/null +++ b/.venv/Lib/site-packages/torch/nn/quantized/_reference/__init__.py @@ -0,0 +1 @@ +from torch.nn.quantized._reference.modules import * # noqa: F403 diff --git a/.venv/Lib/site-packages/torch/nn/quantized/_reference/modules/__init__.py b/.venv/Lib/site-packages/torch/nn/quantized/_reference/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4147ce3a447f7c963da35a2361901d21564e8dfd --- /dev/null +++ b/.venv/Lib/site-packages/torch/nn/quantized/_reference/modules/__init__.py @@ -0,0 +1,39 @@ +# flake8: noqa: F401 +r"""Quantized Reference Modules. + +This module is in the process of migration to +`torch/ao/nn/quantized/reference`, and is kept here for +compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +appropriate file under the `torch/ao/nn/quantized/reference`, +while adding an import statement here. +""" + +from torch.ao.nn.quantized.reference.modules.conv import ( + Conv1d, + Conv2d, + Conv3d, + ConvTranspose1d, + ConvTranspose2d, + ConvTranspose3d, +) +from torch.ao.nn.quantized.reference.modules.linear import Linear +from torch.ao.nn.quantized.reference.modules.rnn import GRUCell, LSTM, LSTMCell, RNNCell +from torch.ao.nn.quantized.reference.modules.sparse import Embedding, EmbeddingBag + + +__all__ = [ + "Linear", + "Conv1d", + "Conv2d", + "Conv3d", + "ConvTranspose1d", + "ConvTranspose2d", + "ConvTranspose3d", + "RNNCell", + "LSTMCell", + "GRUCell", + "LSTM", + "Embedding", + "EmbeddingBag", +] diff --git a/.venv/Lib/site-packages/torch/nn/quantized/_reference/modules/conv.py b/.venv/Lib/site-packages/torch/nn/quantized/_reference/modules/conv.py new file mode 100644 index 0000000000000000000000000000000000000000..bca87e38ad6560a023797f6dfa8cb644ae7e6cf6 --- /dev/null +++ b/.venv/Lib/site-packages/torch/nn/quantized/_reference/modules/conv.py @@ -0,0 +1,21 @@ +# flake8: noqa: F401 +r"""Quantized Reference Modules. + +This module is in the process of migration to +`torch/ao/nn/quantized/reference`, and is kept here for +compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +appropriate file under the `torch/ao/nn/quantized/reference`, +while adding an import statement here. +""" + +from torch.ao.nn.quantized.reference.modules.conv import ( + _ConvNd, + _ConvTransposeNd, + Conv1d, + Conv2d, + Conv3d, + ConvTranspose1d, + ConvTranspose2d, + ConvTranspose3d, +) diff --git a/.venv/Lib/site-packages/torch/nn/quantized/_reference/modules/linear.py b/.venv/Lib/site-packages/torch/nn/quantized/_reference/modules/linear.py new file mode 100644 index 0000000000000000000000000000000000000000..f197031e21a2495a9aeb0ec25273bd28eedd37a7 --- /dev/null +++ b/.venv/Lib/site-packages/torch/nn/quantized/_reference/modules/linear.py @@ -0,0 +1,12 @@ +# flake8: noqa: F401 +r"""Quantized Reference Modules. + +This module is in the process of migration to +`torch/ao/nn/quantized/reference`, and is kept here for +compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +appropriate file under the `torch/ao/nn/quantized/reference`, +while adding an import statement here. 
+""" + +from torch.ao.nn.quantized.reference.modules.linear import Linear diff --git a/.venv/Lib/site-packages/torch/nn/quantized/_reference/modules/rnn.py b/.venv/Lib/site-packages/torch/nn/quantized/_reference/modules/rnn.py new file mode 100644 index 0000000000000000000000000000000000000000..541c53bbb3a5d33b199431ba6d2d7aa325af4779 --- /dev/null +++ b/.venv/Lib/site-packages/torch/nn/quantized/_reference/modules/rnn.py @@ -0,0 +1,19 @@ +# flake8: noqa: F401 +r"""Quantized Reference Modules. + +This module is in the process of migration to +`torch/ao/nn/quantized/reference`, and is kept here for +compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +appropriate file under the `torch/ao/nn/quantized/reference`, +while adding an import statement here. +""" + +from torch.ao.nn.quantized.reference.modules.rnn import ( + GRUCell, + LSTM, + LSTMCell, + RNNBase, + RNNCell, + RNNCellBase, +) diff --git a/.venv/Lib/site-packages/torch/nn/quantized/_reference/modules/sparse.py b/.venv/Lib/site-packages/torch/nn/quantized/_reference/modules/sparse.py new file mode 100644 index 0000000000000000000000000000000000000000..8b6d8594197330dba3303e18ac26d1aea0c58816 --- /dev/null +++ b/.venv/Lib/site-packages/torch/nn/quantized/_reference/modules/sparse.py @@ -0,0 +1,12 @@ +# flake8: noqa: F401 +r"""Quantized Reference Modules. + +This module is in the process of migration to +`torch/ao/nn/quantized/reference`, and is kept here for +compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +appropriate file under the `torch/ao/nn/quantized/reference`, +while adding an import statement here. +""" + +from torch.ao.nn.quantized.reference.modules.sparse import Embedding, EmbeddingBag diff --git a/.venv/Lib/site-packages/torch/nn/quantized/_reference/modules/utils.py b/.venv/Lib/site-packages/torch/nn/quantized/_reference/modules/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..31dc0838be7cd47e1a5004dccac7ca6f3f3d6003 --- /dev/null +++ b/.venv/Lib/site-packages/torch/nn/quantized/_reference/modules/utils.py @@ -0,0 +1,18 @@ +# flake8: noqa: F401 +r"""Quantized Reference Modules. + +This module is in the process of migration to +`torch/ao/nn/quantized/reference`, and is kept here for +compatibility while the migration process is ongoing. +If you are adding a new entry/functionality, please, add it to the +appropriate file under the `torch/ao/nn/quantized/reference`, +while adding an import statement here. 
+""" + +from torch.ao.nn.quantized.reference.modules.utils import ( + _get_weight_qparam_keys, + _quantize_and_dequantize_weight, + _quantize_weight, + _save_weight_qparams, + ReferenceQuantizedModule, +) diff --git a/.venv/Lib/site-packages/torch/nn/utils/__pycache__/__init__.cpython-39.pyc b/.venv/Lib/site-packages/torch/nn/utils/__pycache__/__init__.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d547fc158a0d234811edb02385b688fa3e00c689 Binary files /dev/null and b/.venv/Lib/site-packages/torch/nn/utils/__pycache__/__init__.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/torch/nn/utils/__pycache__/_named_member_accessor.cpython-39.pyc b/.venv/Lib/site-packages/torch/nn/utils/__pycache__/_named_member_accessor.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..075558cacf716a8cd97cfd802f59dd9aec477a41 Binary files /dev/null and b/.venv/Lib/site-packages/torch/nn/utils/__pycache__/_named_member_accessor.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/torch/nn/utils/__pycache__/clip_grad.cpython-39.pyc b/.venv/Lib/site-packages/torch/nn/utils/__pycache__/clip_grad.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1aa485fe402a621007fad4237092a6156b472892 Binary files /dev/null and b/.venv/Lib/site-packages/torch/nn/utils/__pycache__/clip_grad.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/torch/nn/utils/__pycache__/convert_parameters.cpython-39.pyc b/.venv/Lib/site-packages/torch/nn/utils/__pycache__/convert_parameters.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b83d8b2a3fbd7021dd4e65d00b73348d2cb96347 Binary files /dev/null and b/.venv/Lib/site-packages/torch/nn/utils/__pycache__/convert_parameters.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/torch/nn/utils/__pycache__/fusion.cpython-39.pyc b/.venv/Lib/site-packages/torch/nn/utils/__pycache__/fusion.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..546baef1b688c24a18a4cd4c33a79a24db3256d6 Binary files /dev/null and b/.venv/Lib/site-packages/torch/nn/utils/__pycache__/fusion.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/torch/nn/utils/__pycache__/init.cpython-39.pyc b/.venv/Lib/site-packages/torch/nn/utils/__pycache__/init.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4f77ffdd60725cac5599814bea44b5cbec1152e6 Binary files /dev/null and b/.venv/Lib/site-packages/torch/nn/utils/__pycache__/init.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/torch/nn/utils/__pycache__/memory_format.cpython-39.pyc b/.venv/Lib/site-packages/torch/nn/utils/__pycache__/memory_format.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c7cdc8d15386744d0a7fe2bcb8e185f7eab7da4 Binary files /dev/null and b/.venv/Lib/site-packages/torch/nn/utils/__pycache__/memory_format.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/torch/nn/utils/__pycache__/parametrizations.cpython-39.pyc b/.venv/Lib/site-packages/torch/nn/utils/__pycache__/parametrizations.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..745fc10236be52162e594e6f61a0ca11eb0bfe0c Binary files /dev/null and b/.venv/Lib/site-packages/torch/nn/utils/__pycache__/parametrizations.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/torch/nn/utils/__pycache__/parametrize.cpython-39.pyc 
b/.venv/Lib/site-packages/torch/nn/utils/__pycache__/parametrize.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7177ef877502d22ea67395fbcae64573d20f4d9a Binary files /dev/null and b/.venv/Lib/site-packages/torch/nn/utils/__pycache__/parametrize.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/torch/nn/utils/__pycache__/rnn.cpython-39.pyc b/.venv/Lib/site-packages/torch/nn/utils/__pycache__/rnn.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..66dcd3f591e86ccdc07a4385b65391d82eb3c35a Binary files /dev/null and b/.venv/Lib/site-packages/torch/nn/utils/__pycache__/rnn.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/torch/nn/utils/__pycache__/spectral_norm.cpython-39.pyc b/.venv/Lib/site-packages/torch/nn/utils/__pycache__/spectral_norm.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b63d476cfd880148d906c3d0f7245186db416ef7 Binary files /dev/null and b/.venv/Lib/site-packages/torch/nn/utils/__pycache__/spectral_norm.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/torch/nn/utils/__pycache__/stateless.cpython-39.pyc b/.venv/Lib/site-packages/torch/nn/utils/__pycache__/stateless.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a5803b954a0403ab5fa74f49c09397c2136ba043 Binary files /dev/null and b/.venv/Lib/site-packages/torch/nn/utils/__pycache__/stateless.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/torch/nn/utils/__pycache__/weight_norm.cpython-39.pyc b/.venv/Lib/site-packages/torch/nn/utils/__pycache__/weight_norm.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..911d973a5f6b93b3961ff3579ebbe88d54e2d46d Binary files /dev/null and b/.venv/Lib/site-packages/torch/nn/utils/__pycache__/weight_norm.cpython-39.pyc differ diff --git a/.venv/Lib/site-packages/torch/nn/utils/_expanded_weights/__init__.py b/.venv/Lib/site-packages/torch/nn/utils/_expanded_weights/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5a2d51bd98c7b24660795ec873b8b2faa4605061 --- /dev/null +++ b/.venv/Lib/site-packages/torch/nn/utils/_expanded_weights/__init__.py @@ -0,0 +1,10 @@ +from .conv_expanded_weights import ConvPerSampleGrad +from .embedding_expanded_weights import EmbeddingPerSampleGrad +from .expanded_weights_impl import ExpandedWeight +from .group_norm_expanded_weights import GroupNormPerSampleGrad +from .instance_norm_expanded_weights import InstanceNormPerSampleGrad +from .layer_norm_expanded_weights import LayerNormPerSampleGrad +from .linear_expanded_weights import LinearPerSampleGrad + + +__all__ = ["ExpandedWeight"] diff --git a/.venv/Lib/site-packages/torch/nn/utils/clip_grad.py b/.venv/Lib/site-packages/torch/nn/utils/clip_grad.py new file mode 100644 index 0000000000000000000000000000000000000000..5024d8de132465757ce3b1bda9de6906ae968113 --- /dev/null +++ b/.venv/Lib/site-packages/torch/nn/utils/clip_grad.py @@ -0,0 +1,189 @@ +# mypy: allow-untyped-decorators +# mypy: allow-untyped-defs +import functools +from typing import cast, Dict, Iterable, List, Optional, Tuple, Union +from typing_extensions import deprecated + +import torch +from torch import Tensor +from torch.utils._foreach_utils import ( + _device_has_foreach_support, + _group_tensors_by_device_and_dtype, + _has_foreach_support, +) + + +__all__ = ["clip_grad_norm_", "clip_grad_norm", "clip_grad_value_"] + + +_tensor_or_tensors = Union[torch.Tensor, Iterable[torch.Tensor]] + + +def _no_grad(func): + 
""" + This wrapper is needed to avoid a circular import when using @torch.no_grad on the exposed functions + clip_grad_norm_ and clip_grad_value_ themselves. + """ + + def _no_grad_wrapper(*args, **kwargs): + with torch.no_grad(): + return func(*args, **kwargs) + + functools.update_wrapper(_no_grad_wrapper, func) + return _no_grad_wrapper + + +@_no_grad +def clip_grad_norm_( + parameters: _tensor_or_tensors, + max_norm: float, + norm_type: float = 2.0, + error_if_nonfinite: bool = False, + foreach: Optional[bool] = None, +) -> torch.Tensor: + r"""Clip the gradient norm of an iterable of parameters. + + The norm is computed over the norms of the individual gradients of all parameters, + as if the norms of the individual gradients were concatenated into a single vector. + Gradients are modified in-place. + + Args: + parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a + single Tensor that will have gradients normalized + max_norm (float): max norm of the gradients + norm_type (float): type of the used p-norm. Can be ``'inf'`` for + infinity norm. + error_if_nonfinite (bool): if True, an error is thrown if the total + norm of the gradients from :attr:`parameters` is ``nan``, + ``inf``, or ``-inf``. Default: False (will switch to True in the future) + foreach (bool): use the faster foreach-based implementation. + If ``None``, use the foreach implementation for CUDA and CPU native tensors and silently + fall back to the slow implementation for other device types. + Default: ``None`` + + Returns: + Total norm of the parameter gradients (viewed as a single vector). + """ + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + grads = [p.grad for p in parameters if p.grad is not None] + max_norm = float(max_norm) + norm_type = float(norm_type) + if len(grads) == 0: + return torch.tensor(0.0) + first_device = grads[0].device + grouped_grads: Dict[ + Tuple[torch.device, torch.dtype], Tuple[List[List[Tensor]], List[int]] + ] = _group_tensors_by_device_and_dtype( + [grads] + ) # type: ignore[assignment] + + norms: List[Tensor] = [] + for (device, _), ([device_grads], _) in grouped_grads.items(): # type: ignore[assignment] + if (foreach is None and _has_foreach_support(device_grads, device)) or ( + foreach and _device_has_foreach_support(device) + ): + norms.extend(torch._foreach_norm(device_grads, norm_type)) + elif foreach: + raise RuntimeError( + f"foreach=True was passed, but can't use the foreach API on {device.type} tensors" + ) + else: + norms.extend([torch.linalg.vector_norm(g, norm_type) for g in device_grads]) + + total_norm = torch.linalg.vector_norm( + torch.stack([norm.to(first_device) for norm in norms]), norm_type + ) + + if error_if_nonfinite and torch.logical_or(total_norm.isnan(), total_norm.isinf()): + raise RuntimeError( + f"The total norm of order {norm_type} for gradients from " + "`parameters` is non-finite, so it cannot be clipped. To disable " + "this error and scale the gradients by the non-finite norm anyway, " + "set `error_if_nonfinite=False`" + ) + clip_coef = max_norm / (total_norm + 1e-6) + # Note: multiplying by the clamped coef is redundant when the coef is clamped to 1, but doing so + # avoids a `if clip_coef < 1:` conditional which can require a CPU <=> device synchronization + # when the gradients do not reside in CPU memory. 
+ clip_coef_clamped = torch.clamp(clip_coef, max=1.0) + for (device, _), ([device_grads], _) in grouped_grads.items(): # type: ignore[assignment] + if (foreach is None and _has_foreach_support(device_grads, device)) or ( + foreach and _device_has_foreach_support(device) + ): + torch._foreach_mul_(device_grads, clip_coef_clamped.to(device)) + elif foreach: + raise RuntimeError( + f"foreach=True was passed, but can't use the foreach API on {device.type} tensors" + ) + else: + clip_coef_clamped_device = clip_coef_clamped.to(device) + for g in device_grads: + g.mul_(clip_coef_clamped_device) + + return total_norm + + +@deprecated( + "`torch.nn.utils.clip_grad_norm` is now deprecated " + "in favor of `torch.nn.utils.clip_grad_norm_`.", + category=FutureWarning, +) +def clip_grad_norm( + parameters: _tensor_or_tensors, + max_norm: float, + norm_type: float = 2.0, + error_if_nonfinite: bool = False, + foreach: Optional[bool] = None, +) -> torch.Tensor: + r"""Clip the gradient norm of an iterable of parameters. + + .. warning:: + This method is now deprecated in favor of + :func:`torch.nn.utils.clip_grad_norm_`. + """ + return clip_grad_norm_(parameters, max_norm, norm_type, error_if_nonfinite, foreach) + + +@_no_grad +def clip_grad_value_( + parameters: _tensor_or_tensors, + clip_value: float, + foreach: Optional[bool] = None, +) -> None: + r"""Clip the gradients of an iterable of parameters at specified value. + + Gradients are modified in-place. + + Args: + parameters (Iterable[Tensor] or Tensor): an iterable of Tensors or a + single Tensor that will have gradients normalized + clip_value (float): maximum allowed value of the gradients. + The gradients are clipped in the range + :math:`\left[\text{-clip\_value}, \text{clip\_value}\right]` + foreach (bool): use the faster foreach-based implementation + If ``None``, use the foreach implementation for CUDA and CPU native tensors and + silently fall back to the slow implementation for other device types. 
+ Default: ``None`` + """ + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + clip_value = float(clip_value) + + grads = [p.grad for p in parameters if p.grad is not None] + grouped_grads = _group_tensors_by_device_and_dtype([grads]) + + for (device, _), ([grads], _) in grouped_grads.items(): # type: ignore[assignment] + if ( + foreach is None + and _has_foreach_support(cast(List[Tensor], grads), device=device) + ) or (foreach and _device_has_foreach_support(device)): + torch._foreach_clamp_min_(cast(List[Tensor], grads), -clip_value) + torch._foreach_clamp_max_(cast(List[Tensor], grads), clip_value) + elif foreach: + raise RuntimeError( + f"foreach=True was passed, but can't use the foreach API on {device.type} tensors" + ) + else: + for grad in grads: + cast(Tensor, grad).clamp_(min=-clip_value, max=clip_value) diff --git a/.venv/Lib/site-packages/torch/onnx/__init__.py b/.venv/Lib/site-packages/torch/onnx/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4949c960fa653fd9d1b34962ba927470d51d3eae --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/__init__.py @@ -0,0 +1,553 @@ +# mypy: allow-untyped-defs +from __future__ import annotations + + +__all__ = [ + # Modules + "symbolic_helper", + "utils", + "errors", + # All opsets + "symbolic_caffe2", + "symbolic_opset7", + "symbolic_opset8", + "symbolic_opset9", + "symbolic_opset10", + "symbolic_opset11", + "symbolic_opset12", + "symbolic_opset13", + "symbolic_opset14", + "symbolic_opset15", + "symbolic_opset16", + "symbolic_opset17", + "symbolic_opset18", + "symbolic_opset19", + "symbolic_opset20", + # Enums + "ExportTypes", + "OperatorExportTypes", + "TrainingMode", + "TensorProtoDataType", + "JitScalarType", + # Public functions + "export", + "export_to_pretty_string", + "is_in_onnx_export", + "select_model_mode_for_export", + "register_custom_op_symbolic", + "unregister_custom_op_symbolic", + "disable_log", + "enable_log", + # Base error + "OnnxExporterError", + # Dynamo Exporter + "DiagnosticOptions", + "ExportOptions", + "ONNXProgram", + "ONNXRuntimeOptions", + "OnnxRegistry", + "dynamo_export", + "enable_fake_mode", + # DORT / torch.compile + "is_onnxrt_backend_supported", +] + +from typing import Any, Callable, Collection, Mapping, Sequence, TYPE_CHECKING + +import torch +from torch import _C +from torch._C import _onnx as _C_onnx +from torch._C._onnx import OperatorExportTypes, TensorProtoDataType, TrainingMode + +from ._exporter_states import ExportTypes +from ._internal.onnxruntime import ( + is_onnxrt_backend_supported, + OrtBackend as _OrtBackend, + OrtBackendOptions as _OrtBackendOptions, + OrtExecutionProvider as _OrtExecutionProvider, +) +from ._type_utils import JitScalarType +from .errors import OnnxExporterError +from .utils import ( + _optimize_graph, + _run_symbolic_function, + _run_symbolic_method, + export_to_pretty_string, + is_in_onnx_export, + register_custom_op_symbolic, + select_model_mode_for_export, + unregister_custom_op_symbolic, +) + + +from . import ( # usort: skip. Keep the order instead of sorting lexicographically + errors, + symbolic_caffe2, + symbolic_helper, + symbolic_opset7, + symbolic_opset8, + symbolic_opset9, + symbolic_opset10, + symbolic_opset11, + symbolic_opset12, + symbolic_opset13, + symbolic_opset14, + symbolic_opset15, + symbolic_opset16, + symbolic_opset17, + symbolic_opset18, + symbolic_opset19, + symbolic_opset20, + utils, +) + + +from ._internal._exporter_legacy import ( # usort: skip. 
needs to be last to avoid circular import + DiagnosticOptions, + ExportOptions, + ONNXProgram, + ONNXRuntimeOptions, + OnnxRegistry, + enable_fake_mode, +) + + +if TYPE_CHECKING: + import os + +# Set namespace for exposed private names +DiagnosticOptions.__module__ = "torch.onnx" +ExportOptions.__module__ = "torch.onnx" +ExportTypes.__module__ = "torch.onnx" +JitScalarType.__module__ = "torch.onnx" +ONNXProgram.__module__ = "torch.onnx" +ONNXRuntimeOptions.__module__ = "torch.onnx" +OnnxExporterError.__module__ = "torch.onnx" +OnnxRegistry.__module__ = "torch.onnx" +_OrtBackend.__module__ = "torch.onnx" +_OrtBackendOptions.__module__ = "torch.onnx" +_OrtExecutionProvider.__module__ = "torch.onnx" +enable_fake_mode.__module__ = "torch.onnx" +is_onnxrt_backend_supported.__module__ = "torch.onnx" + +producer_name = "pytorch" +producer_version = _C_onnx.PRODUCER_VERSION + + +def export( + model: torch.nn.Module + | torch.export.ExportedProgram + | torch.jit.ScriptModule + | torch.jit.ScriptFunction, + args: tuple[Any, ...] = (), + f: str | os.PathLike | None = None, + *, + kwargs: dict[str, Any] | None = None, + export_params: bool = True, + verbose: bool | None = None, + input_names: Sequence[str] | None = None, + output_names: Sequence[str] | None = None, + opset_version: int | None = None, + dynamic_axes: Mapping[str, Mapping[int, str]] + | Mapping[str, Sequence[int]] + | None = None, + keep_initializers_as_inputs: bool = False, + dynamo: bool = False, + # Dynamo only options + external_data: bool = True, + dynamic_shapes: dict[str, Any] | tuple[Any, ...] | list[Any] | None = None, + report: bool = False, + verify: bool = False, + profile: bool = False, + dump_exported_program: bool = False, + artifacts_dir: str | os.PathLike = ".", + fallback: bool = False, + # Deprecated options + training: _C_onnx.TrainingMode = _C_onnx.TrainingMode.EVAL, + operator_export_type: _C_onnx.OperatorExportTypes = _C_onnx.OperatorExportTypes.ONNX, + do_constant_folding: bool = True, + custom_opsets: Mapping[str, int] | None = None, + export_modules_as_functions: bool | Collection[type[torch.nn.Module]] = False, + autograd_inlining: bool = True, + **_: Any, # ignored options +) -> Any | None: + r"""Exports a model into ONNX format. + + Args: + model: The model to be exported. + args: Example positional inputs. Any non-Tensor arguments will be hard-coded into the + exported model; any Tensor arguments will become inputs of the exported model, + in the order they occur in the tuple. + f: Path to the output ONNX model file. E.g. "model.onnx". + kwargs: Optional example keyword inputs. + export_params: If false, parameters (weights) will not be exported. + verbose: Whether to enable verbose logging. + input_names: names to assign to the input nodes of the graph, in order. + output_names: names to assign to the output nodes of the graph, in order. + opset_version: The version of the + `default (ai.onnx) opset <https://github.com/onnx/onnx/blob/main/docs/Operators.md>`_ + to target. Must be >= 7. + dynamic_axes: + + By default the exported model will have the shapes of all input and output tensors + set to exactly match those given in ``args``. To specify axes of tensors as + dynamic (i.e. known only at run-time), set ``dynamic_axes`` to a dict with schema: + + * KEY (str): an input or output name. Each name must also be provided in ``input_names`` or + ``output_names``. + * VALUE (dict or list): If a dict, keys are axis indices and values are axis names. If a + list, each element is an axis index.
+ + For example:: + + class SumModule(torch.nn.Module): + def forward(self, x): + return torch.sum(x, dim=1) + + + torch.onnx.export( + SumModule(), + (torch.ones(2, 2),), + "onnx.pb", + input_names=["x"], + output_names=["sum"], + ) + + Produces:: + + input { + name: "x" + ... + shape { + dim { + dim_value: 2 # axis 0 + } + dim { + dim_value: 2 # axis 1 + ... + output { + name: "sum" + ... + shape { + dim { + dim_value: 2 # axis 0 + ... + + While:: + + torch.onnx.export( + SumModule(), + (torch.ones(2, 2),), + "onnx.pb", + input_names=["x"], + output_names=["sum"], + dynamic_axes={ + # dict value: manually named axes + "x": {0: "my_custom_axis_name"}, + # list value: automatic names + "sum": [0], + }, + ) + + Produces:: + + input { + name: "x" + ... + shape { + dim { + dim_param: "my_custom_axis_name" # axis 0 + } + dim { + dim_value: 2 # axis 1 + ... + output { + name: "sum" + ... + shape { + dim { + dim_param: "sum_dynamic_axes_1" # axis 0 + ... + + keep_initializers_as_inputs: If True, all the + initializers (typically corresponding to model weights) in the + exported graph will also be added as inputs to the graph. If False, + then initializers are not added as inputs to the graph, and only + the user inputs are added as inputs. + + Set this to True if you intend to supply model weights at runtime. + Set it to False if the weights are static to allow for better optimizations + (e.g. constant folding) by backends/runtimes. + + dynamo: Whether to export the model with ``torch.export`` ExportedProgram instead of TorchScript. + external_data: Whether to save the model weights as an external data file. + This is required for models with large weights that exceed the ONNX file size limit (2GB). + When False, the weights are saved in the ONNX file with the model architecture. + dynamic_shapes: A dictionary of dynamic shapes for the model inputs. Refer to + :func:`torch.export.export` for more details. This is only used (and preferred) when dynamo is True. + Only one parameter `dynamic_axes` or `dynamic_shapes` should be set + at the same time. + report: Whether to generate a markdown report for the export process. + verify: Whether to verify the exported model using ONNX Runtime. + profile: Whether to profile the export process. + dump_exported_program: Whether to dump the :class:`torch.export.ExportedProgram` to a file. + This is useful for debugging the exporter. + artifacts_dir: The directory to save the debugging artifacts like the report and the serialized + exported program. + fallback: Whether to fallback to the TorchScript exporter if the dynamo exporter fails. + + training: Deprecated option. Instead, set the training mode of the model before exporting. + operator_export_type: Deprecated option. Only ONNX is supported. + do_constant_folding: Deprecated option. The exported graph is always optimized. + custom_opsets: Deprecated. + A dictionary: + + * KEY (str): opset domain name + * VALUE (int): opset version + + If a custom opset is referenced by ``model`` but not mentioned in this dictionary, + the opset version is set to 1. Only custom opset domain name and version should be + indicated through this argument. + export_modules_as_functions: Deprecated option. + + Flag to enable + exporting all ``nn.Module`` forward calls as local functions in ONNX. Or a set to indicate the + particular types of modules to export as local functions in ONNX. + This feature requires ``opset_version`` >= 15, otherwise the export will fail. 
This is because + ``opset_version`` < 15 implies IR version < 8, which means no local function support. + Module variables will be exported as function attributes. There are two categories of function + attributes. + + 1. Annotated attributes: class variables that have type annotations via + `PEP 526-style <https://www.python.org/dev/peps/pep-0526/#class-and-instance-variable-annotations>`_ + will be exported as attributes. + Annotated attributes are not used inside the subgraph of ONNX local function because + they are not created by PyTorch JIT tracing, but they may be used by consumers + to determine whether or not to replace the function with a particular fused kernel. + + 2. Inferred attributes: variables that are used by operators inside the module. Attribute names + will have prefix "inferred::". This is to differentiate from predefined attributes retrieved from + python module annotations. Inferred attributes are used inside the subgraph of ONNX local function. + + * ``False`` (default): export ``nn.Module`` forward calls as fine grained nodes. + * ``True``: export all ``nn.Module`` forward calls as local function nodes. + * Set of type of nn.Module: export ``nn.Module`` forward calls as local function nodes, + only if the type of the ``nn.Module`` is found in the set. + autograd_inlining: Deprecated. + Flag used to control whether to inline autograd functions. + Refer to https://github.com/pytorch/pytorch/pull/74765 for more details. + """ + if dynamo is True or isinstance(model, torch.export.ExportedProgram): + from torch.onnx._internal import exporter + + if isinstance(args, torch.Tensor): + args = (args,) + return exporter.export_compat( + model, + args, + f, + kwargs=kwargs, + export_params=export_params, + verbose=verbose, + input_names=input_names, + output_names=output_names, + opset_version=opset_version, + dynamic_axes=dynamic_axes, + keep_initializers_as_inputs=keep_initializers_as_inputs, + external_data=external_data, + dynamic_shapes=dynamic_shapes, + report=report, + verify=verify, + profile=profile, + dump_exported_program=dump_exported_program, + artifacts_dir=artifacts_dir, + fallback=fallback, + ) + else: + from torch.onnx.utils import export + + if dynamic_shapes: + raise ValueError( + "The exporter only supports dynamic shapes " + "through parameter dynamic_axes when dynamo=False." + ) + + export( + model, + args, + f, # type: ignore[arg-type] + kwargs=kwargs, + export_params=export_params, + verbose=verbose is True, + input_names=input_names, + output_names=output_names, + opset_version=opset_version, + dynamic_axes=dynamic_axes, + keep_initializers_as_inputs=keep_initializers_as_inputs, + training=training, + operator_export_type=operator_export_type, + do_constant_folding=do_constant_folding, + custom_opsets=custom_opsets, + export_modules_as_functions=export_modules_as_functions, + autograd_inlining=autograd_inlining, + ) + return None + + +def dynamo_export( + model: torch.nn.Module | Callable | torch.export.ExportedProgram, # type: ignore[name-defined] + /, + *model_args, + export_options: ExportOptions | None = None, + **model_kwargs, +) -> ONNXProgram | Any: + """Export a torch.nn.Module to an ONNX graph. + + Args: + model: The PyTorch model to be exported to ONNX. + model_args: Positional inputs to ``model``. + model_kwargs: Keyword inputs to ``model``. + export_options: Options to influence the export to ONNX. + + Returns: + An in-memory representation of the exported ONNX model.
+ + **Example 1 - Simplest export** + :: + + class MyModel(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.linear = torch.nn.Linear(2, 2) + + def forward(self, x, bias=None): + out = self.linear(x) + out = out + bias + return out + + + model = MyModel() + kwargs = {"bias": 3.0} + args = (torch.randn(2, 2, 2),) + onnx_program = torch.onnx.dynamo_export(model, *args, **kwargs).save( + "my_simple_model.onnx" + ) + + **Example 2 - Exporting with dynamic shapes** + :: + + # The previous model can be exported with dynamic shapes + export_options = torch.onnx.ExportOptions(dynamic_shapes=True) + onnx_program = torch.onnx.dynamo_export( + model, *args, **kwargs, export_options=export_options + ) + onnx_program.save("my_dynamic_model.onnx") + """ + + # NOTE: The new exporter is experimental and is not enabled by default. + import warnings + + from torch.onnx import _flags + from torch.onnx._internal import exporter + from torch.utils import _pytree + + if isinstance(model, torch.export.ExportedProgram): + return exporter.export_compat( + model, # type: ignore[arg-type] + model_args, + f=None, + kwargs=model_kwargs, + opset_version=18, + external_data=True, + export_params=True, + fallback=True, + ) + elif _flags.USE_EXPERIMENTAL_LOGIC: + if export_options is not None: + warnings.warn( + "You are using an experimental ONNX export logic, which currently only supports dynamic shapes. " + "For a more comprehensive set of export options, including advanced features, please consider using " + "`torch.onnx.export(..., dynamo=True)`. ", + category=FutureWarning, + ) + + if export_options is not None and export_options.dynamic_shapes: + # Make all shapes dynamic + def _to_dynamic_shapes_mapper(): + arg_order = 0 + + def _to_dynamic_shape(x): + nonlocal arg_order + if isinstance(x, torch.Tensor): + rank = len(x.shape) + dynamic_shape = {} + for i in range(rank): + dynamic_shape[i] = torch.export.Dim( + f"arg_{arg_order}_dim_{i}" + ) + arg_order += 1 + return dynamic_shape + else: + return None + + return _to_dynamic_shape + + # model_args could be nested + dynamic_shapes = _pytree.tree_map( + _to_dynamic_shapes_mapper(), + model_args, + ) + else: + dynamic_shapes = None + + return exporter.export_compat( + model, # type: ignore[arg-type] + model_args, + f=None, + kwargs=model_kwargs, + dynamic_shapes=dynamic_shapes, + opset_version=18, + external_data=True, + export_params=True, + fallback=True, + ) + else: + from torch.onnx._internal._exporter_legacy import dynamo_export + + return dynamo_export( + model, *model_args, export_options=export_options, **model_kwargs + ) + + +# TODO(justinchuby): Deprecate these logging functions in favor of the new diagnostic module. + +# Returns True iff ONNX logging is turned on. +is_onnx_log_enabled = _C._jit_is_onnx_log_enabled + + +def enable_log() -> None: + r"""Enables ONNX logging.""" + _C._jit_set_onnx_log_enabled(True) + + +def disable_log() -> None: + r"""Disables ONNX logging.""" + _C._jit_set_onnx_log_enabled(False) + + +"""Sets output stream for ONNX logging. + +Args: + stream_name (str, default "stdout"): Only 'stdout' and 'stderr' are supported + as ``stream_name``. +""" +set_log_stream = _C._jit_set_onnx_log_output_stream + + +"""A simple logging facility for ONNX exporter. + +Args: + args: Arguments are converted to string, concatenated together with a newline + character appended to the end, and flushed to output stream. 
+""" +log = _C._jit_onnx_log diff --git a/.venv/Lib/site-packages/torch/onnx/_constants.py b/.venv/Lib/site-packages/torch/onnx/_constants.py new file mode 100644 index 0000000000000000000000000000000000000000..d30e4c061e4d5e9e12db4c05df5c90a0ad5283f2 --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/_constants.py @@ -0,0 +1,25 @@ +"""Constant values used in ONNX.""" + +ONNX_ARCHIVE_MODEL_PROTO_NAME = "__MODEL_PROTO" + +ONNX_BASE_OPSET = 9 +ONNX_MIN_OPSET = 7 +ONNX_MAX_OPSET = 20 +ONNX_TORCHSCRIPT_EXPORTER_MAX_OPSET = 20 +# ONNX_DEFAULT_OPSET generated by tools/onnx/update_default_opset_version.py +ONNX_DEFAULT_OPSET = 17 +ONNX_CONSTANT_FOLDING_MIN_OPSET = 9 + +PYTORCH_GITHUB_ISSUES_URL = "https://github.com/pytorch/pytorch/issues" + +INT64_MAX = 9223372036854775807 +INT32_MAX = 2147483647 +INT16_MAX = 32767 +INT8_MAX = 127 +UINT8_MAX = 255 + +INT64_MIN = -9223372036854775808 +INT32_MIN = -2147483648 +INT16_MIN = -32768 +INT8_MIN = -128 +UINT8_MIN = 0 diff --git a/.venv/Lib/site-packages/torch/onnx/_deprecation.py b/.venv/Lib/site-packages/torch/onnx/_deprecation.py new file mode 100644 index 0000000000000000000000000000000000000000..713671b9b09807d909afd2d8da13b36086d94ddb --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/_deprecation.py @@ -0,0 +1,72 @@ +"""Utility for deprecating functions.""" + +import functools +import textwrap +import warnings +from typing import Callable, TypeVar +from typing_extensions import ParamSpec + + +_T = TypeVar("_T") +_P = ParamSpec("_P") + + +def deprecated( + since: str, removed_in: str, instructions: str +) -> Callable[[Callable[_P, _T]], Callable[_P, _T]]: + """Marks functions as deprecated. + + It will result in a warning when the function is called and a note in the + docstring. + + Args: + since: The version when the function was first deprecated. + removed_in: The version when the function will be removed. + instructions: The action users should take. + """ + + def decorator(function: Callable[_P, _T]) -> Callable[_P, _T]: + @functools.wraps(function) + def wrapper(*args: _P.args, **kwargs: _P.kwargs) -> _T: + warnings.warn( + f"'{function.__module__}.{function.__name__}' " + f"is deprecated in version {since} and will be " + f"removed in {removed_in}. Please {instructions}.", + category=FutureWarning, + stacklevel=2, + ) + return function(*args, **kwargs) + + # Add a deprecation note to the docstring. + docstring = function.__doc__ or "" + + # Add a note to the docstring. + deprecation_note = textwrap.dedent( + f"""\ + .. deprecated:: {since} + Deprecated and will be removed in version {removed_in}. + Please {instructions}. + """ + ) + + # Split docstring at first occurrence of newline + summary_and_body = docstring.split("\n\n", 1) + + if len(summary_and_body) > 1: + summary, body = summary_and_body + + # Dedent the body. We cannot do this with the presence of the summary because + # the body contains leading whitespaces when the summary does not. 
+ body = textwrap.dedent(body) + + new_docstring_parts = [deprecation_note, "\n\n", summary, body] + else: + summary = summary_and_body[0] + + new_docstring_parts = [deprecation_note, "\n\n", summary] + + wrapper.__doc__ = "".join(new_docstring_parts) + + return wrapper + + return decorator diff --git a/.venv/Lib/site-packages/torch/onnx/_experimental.py b/.venv/Lib/site-packages/torch/onnx/_experimental.py new file mode 100644 index 0000000000000000000000000000000000000000..879fd55433c669a63d46800c88722bf7f7757595 --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/_experimental.py @@ -0,0 +1,27 @@ +"""Experimental classes and functions used by ONNX export.""" + +import dataclasses +from typing import Mapping, Optional, Sequence, Set, Type, Union + +import torch +import torch._C._onnx as _C_onnx + + +@dataclasses.dataclass +class ExportOptions: + """Arguments used by :func:`torch.onnx.export`.""" + + # TODO(justinchuby): Deprecate and remove this class. + + export_params: bool = True + verbose: bool = False + training: _C_onnx.TrainingMode = _C_onnx.TrainingMode.EVAL + input_names: Optional[Sequence[str]] = None + output_names: Optional[Sequence[str]] = None + operator_export_type: _C_onnx.OperatorExportTypes = _C_onnx.OperatorExportTypes.ONNX + opset_version: Optional[int] = None + do_constant_folding: bool = True + dynamic_axes: Optional[Mapping[str, Union[Mapping[int, str], Sequence[int]]]] = None + keep_initializers_as_inputs: Optional[bool] = None + custom_opsets: Optional[Mapping[str, int]] = None + export_modules_as_functions: Union[bool, Set[Type[torch.nn.Module]]] = False diff --git a/.venv/Lib/site-packages/torch/onnx/_exporter_states.py b/.venv/Lib/site-packages/torch/onnx/_exporter_states.py new file mode 100644 index 0000000000000000000000000000000000000000..17144d2441c2839049f97e524ca34f2384f94438 --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/_exporter_states.py @@ -0,0 +1,12 @@ +from __future__ import annotations + + +class ExportTypes: + """Specifies how the ONNX model is stored.""" + + # TODO(justinchuby): Deprecate and remove this class. + + PROTOBUF_FILE = "Saves model in the specified protobuf file." + ZIP_ARCHIVE = "Saves model in the specified ZIP file (uncompressed)." + COMPRESSED_ZIP_ARCHIVE = "Saves model in the specified ZIP file (compressed)." + DIRECTORY = "Saves model in the specified folder." diff --git a/.venv/Lib/site-packages/torch/onnx/_flags.py b/.venv/Lib/site-packages/torch/onnx/_flags.py new file mode 100644 index 0000000000000000000000000000000000000000..437707cfa657acd66b75ca74fb1b272b03318587 --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/_flags.py @@ -0,0 +1,49 @@ +"""Internal feature flags for torch.onnx. + +NOTE: These flags are experimental only. Any flag here can be removed at any +time without notice. +""" + +import logging +import os + + +logger = logging.getLogger(__name__) + + +def _load_boolean_flag( + name: str, + *, + this_will: str, + deprecated: bool = False, + default: bool = False, +) -> bool: + """Load a boolean flag from environment variable. + + Args: + name: The name of the environment variable. + this_will: A string that describes what this flag will do. + deprecated: Whether this flag is deprecated. + default: The default value if envvar not defined. + """ + undefined = os.getenv(name) is None + state = os.getenv(name) == "1" + if state: + if deprecated: + logger.error( + "Experimental flag %s is deprecated. 
Please remove it from your environment.", + name, + ) + else: + logger.warning( + "Experimental flag %s is enabled. This will %s.", name, this_will + ) + if undefined: + state = default + return state + + +USE_EXPERIMENTAL_LOGIC: bool = _load_boolean_flag( + "TORCH_ONNX_USE_EXPERIMENTAL_LOGIC", + this_will="use ExportedProgram and the new torch.onnx export logic", +) diff --git a/.venv/Lib/site-packages/torch/onnx/_globals.py b/.venv/Lib/site-packages/torch/onnx/_globals.py new file mode 100644 index 0000000000000000000000000000000000000000..ca19d611a6c17ffc7540c8f2ef268cc5ca43141c --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/_globals.py @@ -0,0 +1,87 @@ +# mypy: allow-untyped-defs +"""Globals used internally by the ONNX exporter. + +Do not use this module outside of `torch.onnx` and its tests. + +Be very judicious when adding any new global variables. Do not create new global +variables unless they are absolutely necessary. +""" + +import torch._C._onnx as _C_onnx + +# This module should only depend on _constants and nothing else in torch.onnx to keep +# dependency direction clean. +from torch.onnx import _constants + + +class _InternalGlobals: + """Globals used internally by ONNX exporter. + + NOTE: Be very judicious when adding any new variables. Do not create new + global variables unless they are absolutely necessary. + """ + + def __init__(self) -> None: + self._export_onnx_opset_version = _constants.ONNX_DEFAULT_OPSET + self._training_mode: _C_onnx.TrainingMode = _C_onnx.TrainingMode.EVAL + self._in_onnx_export: bool = False + # Whether the user's model is training during export + self.export_training: bool = False + self.operator_export_type: _C_onnx.OperatorExportTypes = ( + _C_onnx.OperatorExportTypes.ONNX + ) + self.onnx_shape_inference: bool = True + self._autograd_inlining: bool = True + + @property + def training_mode(self): + """The training mode for the exporter.""" + return self._training_mode + + @training_mode.setter + def training_mode(self, training_mode: _C_onnx.TrainingMode): + if not isinstance(training_mode, _C_onnx.TrainingMode): + raise TypeError( + "training_mode must be of type 'torch.onnx.TrainingMode'. This is " + "likely a bug in torch.onnx." 
+ ) + self._training_mode = training_mode + + @property + def export_onnx_opset_version(self) -> int: + """Opset version used during export.""" + return self._export_onnx_opset_version + + @export_onnx_opset_version.setter + def export_onnx_opset_version(self, value: int): + supported_versions = range( + _constants.ONNX_MIN_OPSET, _constants.ONNX_MAX_OPSET + 1 + ) + if value not in supported_versions: + raise ValueError(f"Unsupported ONNX opset version: {value}") + self._export_onnx_opset_version = value + + @property + def in_onnx_export(self) -> bool: + """Whether it is in the middle of ONNX export.""" + return self._in_onnx_export + + @in_onnx_export.setter + def in_onnx_export(self, value: bool): + if type(value) is not bool: + raise TypeError("in_onnx_export must be a boolean") + self._in_onnx_export = value + + @property + def autograd_inlining(self) -> bool: + """Whether Autograd must be inlined.""" + return self._autograd_inlining + + @autograd_inlining.setter + def autograd_inlining(self, value: bool): + if type(value) is not bool: + raise TypeError("autograd_inlining must be a boolean") + self._autograd_inlining = value + + +GLOBALS = _InternalGlobals() diff --git a/.venv/Lib/site-packages/torch/onnx/_internal/__init__.py b/.venv/Lib/site-packages/torch/onnx/_internal/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/Lib/site-packages/torch/onnx/_internal/_lazy_import.py b/.venv/Lib/site-packages/torch/onnx/_internal/_lazy_import.py new file mode 100644 index 0000000000000000000000000000000000000000..54baa88eea4e2ea9dc71967fc5aebc2ed98b21e6 --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/_internal/_lazy_import.py @@ -0,0 +1,41 @@ +"""Utility to lazily import modules.""" + +# mypy: allow-untyped-defs +from __future__ import annotations + +import importlib +from typing import Any, TYPE_CHECKING + + +class _LazyModule: + """Lazily import a module.""" + + def __init__(self, module_name: str) -> None: + self._name = module_name + self._module: Any = None + + def __repr__(self) -> str: + return f"<lazy module '{self._name}'>" + + def __getattr__(self, attr): + if self._module is None: + self._module = importlib.import_module(".", self._name) + return getattr(self._module, attr) + + +# Import the following modules during type checking to enable code intelligence features, +# such as auto-completion in tools like pylance, even when these modules are not explicitly +# imported in user code. +# NOTE: Add additional used imports here.
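+# A minimal sketch of the laziness (illustration only, not part of this file):
+# `onnx = _LazyModule("onnx")` imports nothing at module load; the real import
+# happens inside __getattr__ on first attribute access, e.g. `onnx.__version__`,
+# so `import torch.onnx` stays cheap even when onnx/onnxscript are installed.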
+if TYPE_CHECKING: + import onnx + import onnxscript + import onnxscript._framework_apis.torch_2_5 as onnxscript_apis + + onnxscript_ir = onnxscript.ir + +else: + onnx = _LazyModule("onnx") + onnxscript = _LazyModule("onnxscript") + onnxscript_ir = _LazyModule("onnxscript.ir") + onnxscript_apis = _LazyModule("onnxscript._framework_apis.torch_2_5") diff --git a/.venv/Lib/site-packages/torch/onnx/_internal/diagnostics/__init__.py b/.venv/Lib/site-packages/torch/onnx/_internal/diagnostics/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1524e370c1654c274ab0472f080d68c03c82dfd5 --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/_internal/diagnostics/__init__.py @@ -0,0 +1,22 @@ +from ._diagnostic import ( + create_export_diagnostic_context, + diagnose, + engine, + export_context, + ExportDiagnosticEngine, + TorchScriptOnnxExportDiagnostic, +) +from ._rules import rules +from .infra import levels + + +__all__ = [ + "TorchScriptOnnxExportDiagnostic", + "ExportDiagnosticEngine", + "rules", + "levels", + "engine", + "export_context", + "create_export_diagnostic_context", + "diagnose", +] diff --git a/.venv/Lib/site-packages/torch/onnx/_internal/diagnostics/_diagnostic.py b/.venv/Lib/site-packages/torch/onnx/_internal/diagnostics/_diagnostic.py new file mode 100644 index 0000000000000000000000000000000000000000..053872cd50ae83dae6c814338127cbe9c45f7d65 --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/_internal/diagnostics/_diagnostic.py @@ -0,0 +1,211 @@ +# mypy: allow-untyped-defs +"""Diagnostic components for TorchScript based ONNX export, i.e. `torch.onnx.export`.""" + +from __future__ import annotations + +import contextlib +import gzip +from typing import TYPE_CHECKING + +import torch +from torch.onnx._internal.diagnostics import infra +from torch.onnx._internal.diagnostics.infra import formatter, sarif +from torch.onnx._internal.diagnostics.infra.sarif import version as sarif_version +from torch.utils import cpp_backtrace + + +if TYPE_CHECKING: + from collections.abc import Generator + + +def _cpp_call_stack(frames_to_skip: int = 0, frames_to_log: int = 32) -> infra.Stack: + """Returns the current C++ call stack. + + This function utilizes `torch.utils.cpp_backtrace` to get the current C++ call stack. + The returned C++ call stack is a concatenated string of the C++ call stack frames. + Each frame is separated by a newline character, in the same format of + r"frame #[0-9]+: (?P.*)". More info at `c10/util/Backtrace.cpp`. + + """ + frames = cpp_backtrace.get_cpp_backtrace(frames_to_skip, frames_to_log).split("\n") + frame_messages = [] + for frame in frames: + segments = frame.split(":", 1) + if len(segments) == 2: + frame_messages.append(segments[1].strip()) + else: + frame_messages.append("") + return infra.Stack( + frames=[ + infra.StackFrame(location=infra.Location(message=message)) + for message in frame_messages + ] + ) + + +class TorchScriptOnnxExportDiagnostic(infra.Diagnostic): + """Base class for all export diagnostics. + + This class is used to represent all export diagnostics. It is a subclass of + infra.Diagnostic, and adds additional methods to add more information to the + diagnostic. 
+ """ + + python_call_stack: infra.Stack | None = None + cpp_call_stack: infra.Stack | None = None + + def __init__( + self, + *args, + frames_to_skip: int = 1, + cpp_stack: bool = False, + **kwargs, + ) -> None: + super().__init__(*args, **kwargs) + self.python_call_stack = self.record_python_call_stack( + frames_to_skip=frames_to_skip + ) + if cpp_stack: + self.cpp_call_stack = self.record_cpp_call_stack( + frames_to_skip=frames_to_skip + ) + + def record_cpp_call_stack(self, frames_to_skip: int) -> infra.Stack: + """Records the current C++ call stack in the diagnostic.""" + stack = _cpp_call_stack(frames_to_skip=frames_to_skip) + stack.message = "C++ call stack" + self.with_stack(stack) + return stack + + +class ExportDiagnosticEngine: + """PyTorch ONNX Export diagnostic engine. + + The only purpose of creating this class instead of using `DiagnosticContext` directly + is to provide a background context for `diagnose` calls inside exporter. + + By design, one `torch.onnx.export` call should initialize one diagnostic context. + All `diagnose` calls inside exporter should be made in the context of that export. + However, since diagnostic context is currently being accessed via a global variable, + there is no guarantee that the context is properly initialized. Therefore, we need + to provide a default background context to fallback to, otherwise any invocation of + exporter internals, e.g. unit tests, will fail due to missing diagnostic context. + This can be removed once the pipeline for context to flow through the exporter is + established. + """ + + contexts: list[infra.DiagnosticContext] + _background_context: infra.DiagnosticContext + + def __init__(self) -> None: + self.contexts = [] + self._background_context = infra.DiagnosticContext( + name="torch.onnx", + version=torch.__version__, + ) + + @property + def background_context(self) -> infra.DiagnosticContext: + return self._background_context + + def create_diagnostic_context( + self, + name: str, + version: str, + options: infra.DiagnosticOptions | None = None, + ) -> infra.DiagnosticContext: + """Creates a new diagnostic context. + + Args: + name: The subject name for the diagnostic context. + version: The subject version for the diagnostic context. + options: The options for the diagnostic context. + + Returns: + A new diagnostic context. + """ + if options is None: + options = infra.DiagnosticOptions() + context: infra.DiagnosticContext[infra.Diagnostic] = infra.DiagnosticContext( + name, version, options + ) + self.contexts.append(context) + return context + + def clear(self): + """Clears all diagnostic contexts.""" + self.contexts.clear() + self._background_context.diagnostics.clear() + + def to_json(self) -> str: + return formatter.sarif_to_json(self.sarif_log()) + + def dump(self, file_path: str, compress: bool = False) -> None: + """Dumps the SARIF log to a file.""" + if compress: + with gzip.open(file_path, "wt") as f: + f.write(self.to_json()) + else: + with open(file_path, "w") as f: + f.write(self.to_json()) + + def sarif_log(self): + log = sarif.SarifLog( + version=sarif_version.SARIF_VERSION, + schema_uri=sarif_version.SARIF_SCHEMA_LINK, + runs=[context.sarif() for context in self.contexts], + ) + + log.runs.append(self._background_context.sarif()) + return log + + +engine = ExportDiagnosticEngine() +_context = engine.background_context + + +@contextlib.contextmanager +def create_export_diagnostic_context() -> ( + Generator[infra.DiagnosticContext, None, None] +): + """Create a diagnostic context for export. 
+ + This is a workaround for code robustness since diagnostic context is accessed by + export internals via global variable. See `ExportDiagnosticEngine` for more details. + """ + global _context + assert ( + _context == engine.background_context + ), "Export context is already set. Nested export is not supported." + _context = engine.create_diagnostic_context( + "torch.onnx.export", + torch.__version__, + ) + try: + yield _context + finally: + _context = engine.background_context + + +def diagnose( + rule: infra.Rule, + level: infra.Level, + message: str | None = None, + frames_to_skip: int = 2, + **kwargs, +) -> TorchScriptOnnxExportDiagnostic: + """Creates a diagnostic and record it in the global diagnostic context. + + This is a wrapper around `context.log` that uses the global diagnostic + context. + """ + diagnostic = TorchScriptOnnxExportDiagnostic( + rule, level, message, frames_to_skip=frames_to_skip, **kwargs + ) + export_context().log(diagnostic) + return diagnostic + + +def export_context() -> infra.DiagnosticContext: + global _context + return _context diff --git a/.venv/Lib/site-packages/torch/onnx/_internal/diagnostics/_rules.py b/.venv/Lib/site-packages/torch/onnx/_internal/diagnostics/_rules.py new file mode 100644 index 0000000000000000000000000000000000000000..eebf73f26b201073423cb6923926312fb9eec07d --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/_internal/diagnostics/_rules.py @@ -0,0 +1,636 @@ +# mypy: allow-untyped-defs +""" +GENERATED CODE - DO NOT EDIT DIRECTLY +This file is generated by gen_diagnostics.py. +See tools/onnx/gen_diagnostics.py for more information. + +Diagnostic rules for PyTorch ONNX export. +""" + +import dataclasses +from typing import Tuple + +# flake8: noqa +from torch.onnx._internal.diagnostics import infra + + +""" +GENERATED CODE - DO NOT EDIT DIRECTLY +The purpose of generating a class for each rule is to override the `format_message` +method to provide more details in the signature about the format arguments. +""" + + +class _NodeMissingOnnxShapeInference(infra.Rule): + """Node is missing ONNX shape inference.""" + + def format_message(self, op_name) -> str: # type: ignore[override] + """Returns the formatted default message of this Rule. + + Message template: 'The shape inference of {op_name} type is missing, so it may result in wrong shape inference for the exported graph. Please consider adding it in symbolic function.' + """ + return self.message_default_template.format(op_name=op_name) + + def format( # type: ignore[override] + self, level: infra.Level, op_name + ) -> Tuple[infra.Rule, infra.Level, str]: + """Returns a tuple of (Rule, Level, message) for this Rule. + + Message template: 'The shape inference of {op_name} type is missing, so it may result in wrong shape inference for the exported graph. Please consider adding it in symbolic function.' + """ + return self, level, self.format_message(op_name=op_name) + + +class _MissingCustomSymbolicFunction(infra.Rule): + """Missing symbolic function for custom PyTorch operator, cannot translate node to ONNX.""" + + def format_message(self, op_name) -> str: # type: ignore[override] + """Returns the formatted default message of this Rule. + + Message template: 'ONNX export failed on an operator with unrecognized namespace {op_name}. If you are trying to export a custom operator, make sure you registered it with the right domain and version.' 
+ """ + return self.message_default_template.format(op_name=op_name) + + def format( # type: ignore[override] + self, level: infra.Level, op_name + ) -> Tuple[infra.Rule, infra.Level, str]: + """Returns a tuple of (Rule, Level, message) for this Rule. + + Message template: 'ONNX export failed on an operator with unrecognized namespace {op_name}. If you are trying to export a custom operator, make sure you registered it with the right domain and version.' + """ + return self, level, self.format_message(op_name=op_name) + + +class _MissingStandardSymbolicFunction(infra.Rule): + """Missing symbolic function for standard PyTorch operator, cannot translate node to ONNX.""" + + def format_message( # type: ignore[override] + self, op_name, opset_version, issue_url + ) -> str: + """Returns the formatted default message of this Rule. + + Message template: "Exporting the operator '{op_name}' to ONNX opset version {opset_version} is not supported. Please feel free to request support or submit a pull request on PyTorch GitHub: {issue_url}." + """ + return self.message_default_template.format( + op_name=op_name, opset_version=opset_version, issue_url=issue_url + ) + + def format( # type: ignore[override] + self, level: infra.Level, op_name, opset_version, issue_url + ) -> Tuple[infra.Rule, infra.Level, str]: + """Returns a tuple of (Rule, Level, message) for this Rule. + + Message template: "Exporting the operator '{op_name}' to ONNX opset version {opset_version} is not supported. Please feel free to request support or submit a pull request on PyTorch GitHub: {issue_url}." + """ + return ( + self, + level, + self.format_message( + op_name=op_name, opset_version=opset_version, issue_url=issue_url + ), + ) + + +class _OperatorSupportedInNewerOpsetVersion(infra.Rule): + """Operator is supported in newer opset version.""" + + def format_message( # type: ignore[override] + self, op_name, opset_version, supported_opset_version + ) -> str: + """Returns the formatted default message of this Rule. + + Message template: "Exporting the operator '{op_name}' to ONNX opset version {opset_version} is not supported. Support for this operator was added in version {supported_opset_version}, try exporting with this version." + """ + return self.message_default_template.format( + op_name=op_name, + opset_version=opset_version, + supported_opset_version=supported_opset_version, + ) + + def format( # type: ignore[override] + self, level: infra.Level, op_name, opset_version, supported_opset_version + ) -> Tuple[infra.Rule, infra.Level, str]: + """Returns a tuple of (Rule, Level, message) for this Rule. + + Message template: "Exporting the operator '{op_name}' to ONNX opset version {opset_version} is not supported. Support for this operator was added in version {supported_opset_version}, try exporting with this version." + """ + return ( + self, + level, + self.format_message( + op_name=op_name, + opset_version=opset_version, + supported_opset_version=supported_opset_version, + ), + ) + + +class _FxGraphToOnnx(infra.Rule): + """Transforms graph from FX IR to ONNX IR.""" + + def format_message(self, graph_name) -> str: # type: ignore[override] + """Returns the formatted default message of this Rule. + + Message template: 'Transforming FX graph {graph_name} to ONNX graph.' 
+ """ + return self.message_default_template.format(graph_name=graph_name) + + def format( # type: ignore[override] + self, level: infra.Level, graph_name + ) -> Tuple[infra.Rule, infra.Level, str]: + """Returns a tuple of (Rule, Level, message) for this Rule. + + Message template: 'Transforming FX graph {graph_name} to ONNX graph.' + """ + return self, level, self.format_message(graph_name=graph_name) + + +class _FxNodeToOnnx(infra.Rule): + """Transforms an FX node to an ONNX node.""" + + def format_message(self, node_repr) -> str: # type: ignore[override] + """Returns the formatted default message of this Rule. + + Message template: 'Transforming FX node {node_repr} to ONNX node.' + """ + return self.message_default_template.format(node_repr=node_repr) + + def format( # type: ignore[override] + self, level: infra.Level, node_repr + ) -> Tuple[infra.Rule, infra.Level, str]: + """Returns a tuple of (Rule, Level, message) for this Rule. + + Message template: 'Transforming FX node {node_repr} to ONNX node.' + """ + return self, level, self.format_message(node_repr=node_repr) + + +class _FxPass(infra.Rule): + """FX graph transformation during ONNX export before converting from FX IR to ONNX IR.""" + + def format_message(self, pass_name) -> str: # type: ignore[override] + """Returns the formatted default message of this Rule. + + Message template: 'Running {pass_name} pass.' + """ + return self.message_default_template.format(pass_name=pass_name) + + def format( # type: ignore[override] + self, level: infra.Level, pass_name + ) -> Tuple[infra.Rule, infra.Level, str]: + """Returns a tuple of (Rule, Level, message) for this Rule. + + Message template: 'Running {pass_name} pass.' + """ + return self, level, self.format_message(pass_name=pass_name) + + +class _NoSymbolicFunctionForCallFunction(infra.Rule): + """Cannot find symbolic function to convert the "call_function" FX node to ONNX.""" + + def format_message(self, target) -> str: # type: ignore[override] + """Returns the formatted default message of this Rule. + + Message template: 'No symbolic function to convert the "call_function" node {target} to ONNX. ' + """ + return self.message_default_template.format(target=target) + + def format( # type: ignore[override] + self, level: infra.Level, target + ) -> Tuple[infra.Rule, infra.Level, str]: + """Returns a tuple of (Rule, Level, message) for this Rule. + + Message template: 'No symbolic function to convert the "call_function" node {target} to ONNX. ' + """ + return self, level, self.format_message(target=target) + + +class _UnsupportedFxNodeAnalysis(infra.Rule): + """Result from FX graph analysis to reveal unsupported FX nodes.""" + + def format_message( # type: ignore[override] + self, node_op_to_target_mapping + ) -> str: + """Returns the formatted default message of this Rule. + + Message template: 'Unsupported FX nodes: {node_op_to_target_mapping}. ' + """ + return self.message_default_template.format( + node_op_to_target_mapping=node_op_to_target_mapping + ) + + def format( # type: ignore[override] + self, level: infra.Level, node_op_to_target_mapping + ) -> Tuple[infra.Rule, infra.Level, str]: + """Returns a tuple of (Rule, Level, message) for this Rule. + + Message template: 'Unsupported FX nodes: {node_op_to_target_mapping}. 
' + """ + return ( + self, + level, + self.format_message(node_op_to_target_mapping=node_op_to_target_mapping), + ) + + +class _OpLevelDebugging(infra.Rule): + """Report any op level validation failure in warnings.""" + + def format_message(self, node, symbolic_fn) -> str: # type: ignore[override] + """Returns the formatted default message of this Rule. + + Message template: 'FX node: {node} and its onnx function: {symbolic_fn} fails on op level validation.' + """ + return self.message_default_template.format(node=node, symbolic_fn=symbolic_fn) + + def format( # type: ignore[override] + self, level: infra.Level, node, symbolic_fn + ) -> Tuple[infra.Rule, infra.Level, str]: + """Returns a tuple of (Rule, Level, message) for this Rule. + + Message template: 'FX node: {node} and its onnx function: {symbolic_fn} fails on op level validation.' + """ + return self, level, self.format_message(node=node, symbolic_fn=symbolic_fn) + + +class _FindOpschemaMatchedSymbolicFunction(infra.Rule): + """Find the OnnxFunction that matches the input/attribute dtypes by comparing them with their opschemas.""" + + def format_message(self, symbolic_fn, node) -> str: # type: ignore[override] + """Returns the formatted default message of this Rule. + + Message template: 'The OnnxFunction: {symbolic_fn} is the nearest match of the node {node}.' + """ + return self.message_default_template.format(symbolic_fn=symbolic_fn, node=node) + + def format( # type: ignore[override] + self, level: infra.Level, symbolic_fn, node + ) -> Tuple[infra.Rule, infra.Level, str]: + """Returns a tuple of (Rule, Level, message) for this Rule. + + Message template: 'The OnnxFunction: {symbolic_fn} is the nearest match of the node {node}.' + """ + return self, level, self.format_message(symbolic_fn=symbolic_fn, node=node) + + +class _FxNodeInsertTypePromotion(infra.Rule): + """Determine if type promotion is required for the FX node. Insert cast nodes if needed.""" + + def format_message(self, target) -> str: # type: ignore[override] + """Returns the formatted default message of this Rule. + + Message template: 'Performing explicit type promotion for node {target}. ' + """ + return self.message_default_template.format(target=target) + + def format( # type: ignore[override] + self, level: infra.Level, target + ) -> Tuple[infra.Rule, infra.Level, str]: + """Returns a tuple of (Rule, Level, message) for this Rule. + + Message template: 'Performing explicit type promotion for node {target}. ' + """ + return self, level, self.format_message(target=target) + + +class _FindOperatorOverloadsInOnnxRegistry(infra.Rule): + """Find the list of OnnxFunction of the PyTorch operator in onnx registry.""" + + def format_message(self, node) -> str: # type: ignore[override] + """Returns the formatted default message of this Rule. + + Message template: 'Checking if the FX node: {node} is supported in onnx registry.' + """ + return self.message_default_template.format(node=node) + + def format( # type: ignore[override] + self, level: infra.Level, node + ) -> Tuple[infra.Rule, infra.Level, str]: + """Returns a tuple of (Rule, Level, message) for this Rule. + + Message template: 'Checking if the FX node: {node} is supported in onnx registry.' 
+ """ + return self, level, self.format_message(node=node) + + +@dataclasses.dataclass +class _POERules(infra.RuleCollection): + node_missing_onnx_shape_inference: _NodeMissingOnnxShapeInference = dataclasses.field( + default=_NodeMissingOnnxShapeInference.from_sarif( + **{ + "id": "POE0001", + "name": "node-missing-onnx-shape-inference", + "short_description": {"text": "Node is missing ONNX shape inference."}, + "full_description": { + "text": "Node is missing ONNX shape inference. This usually happens when the node is not valid under standard ONNX operator spec.", + "markdown": "Node is missing ONNX shape inference.\nThis usually happens when the node is not valid under standard ONNX operator spec.\n", + }, + "message_strings": { + "default": { + "text": "The shape inference of {op_name} type is missing, so it may result in wrong shape inference for the exported graph. Please consider adding it in symbolic function." + } + }, + "help_uri": None, + "properties": {"deprecated": False, "tags": []}, + } + ), + init=False, + ) + """Node is missing ONNX shape inference.""" + + missing_custom_symbolic_function: _MissingCustomSymbolicFunction = dataclasses.field( + default=_MissingCustomSymbolicFunction.from_sarif( + **{ + "id": "POE0002", + "name": "missing-custom-symbolic-function", + "short_description": { + "text": "Missing symbolic function for custom PyTorch operator, cannot translate node to ONNX." + }, + "full_description": { + "text": "Missing symbolic function for custom PyTorch operator, cannot translate node to ONNX.", + "markdown": "Missing symbolic function for custom PyTorch operator, cannot translate node to ONNX.\n", + }, + "message_strings": { + "default": { + "text": "ONNX export failed on an operator with unrecognized namespace {op_name}. If you are trying to export a custom operator, make sure you registered it with the right domain and version." + } + }, + "help_uri": None, + "properties": {"deprecated": False, "tags": []}, + } + ), + init=False, + ) + """Missing symbolic function for custom PyTorch operator, cannot translate node to ONNX.""" + + missing_standard_symbolic_function: _MissingStandardSymbolicFunction = dataclasses.field( + default=_MissingStandardSymbolicFunction.from_sarif( + **{ + "id": "POE0003", + "name": "missing-standard-symbolic-function", + "short_description": { + "text": "Missing symbolic function for standard PyTorch operator, cannot translate node to ONNX." + }, + "full_description": { + "text": "Missing symbolic function for standard PyTorch operator, cannot translate node to ONNX.", + "markdown": "Missing symbolic function for standard PyTorch operator, cannot translate node to ONNX.\n", + }, + "message_strings": { + "default": { + "text": "Exporting the operator '{op_name}' to ONNX opset version {opset_version} is not supported. Please feel free to request support or submit a pull request on PyTorch GitHub: {issue_url}." + } + }, + "help_uri": None, + "properties": {"deprecated": False, "tags": []}, + } + ), + init=False, + ) + """Missing symbolic function for standard PyTorch operator, cannot translate node to ONNX.""" + + operator_supported_in_newer_opset_version: _OperatorSupportedInNewerOpsetVersion = dataclasses.field( + default=_OperatorSupportedInNewerOpsetVersion.from_sarif( + **{ + "id": "POE0004", + "name": "operator-supported-in-newer-opset-version", + "short_description": { + "text": "Operator is supported in newer opset version." 
+ }, + "full_description": { + "text": "Operator is supported in newer opset version.", + "markdown": "Operator is supported in newer opset version.\n\nExample:\n```python\ntorch.onnx.export(model, args, ..., opset_version=9)\n```\n", + }, + "message_strings": { + "default": { + "text": "Exporting the operator '{op_name}' to ONNX opset version {opset_version} is not supported. Support for this operator was added in version {supported_opset_version}, try exporting with this version." + } + }, + "help_uri": None, + "properties": {"deprecated": False, "tags": []}, + } + ), + init=False, + ) + """Operator is supported in newer opset version.""" + + fx_graph_to_onnx: _FxGraphToOnnx = dataclasses.field( + default=_FxGraphToOnnx.from_sarif( + **{ + "id": "FXE0007", + "name": "fx-graph-to-onnx", + "short_description": { + "text": "Transforms graph from FX IR to ONNX IR." + }, + "full_description": { + "text": "Transforms graph from FX IR to ONNX IR.", + "markdown": "This diagnostic tracks the transformation process from an FX Graph (in FX IR) to an ONNX Graph (in ONNX IR).\n\n## Key Representations:\n\n- **FX Graph**: The graph in FX IR produced by dynamo or symbolic tracing.\n- **ONNX Graph**: The graph in ONNX IR and [operators](https://onnx.ai/onnx/operators/).\n\n## Additional Notes:\n\n- Prior to this transformation step, the FX graph undergoes preprocessing through multiple FX passes.\n To gain insight into these transformations, refer to diagnostic `FXE0010`.\n- To enable a detailed view of the graph transformation in progress within this diagnostic, switch to the DEBUG mode.\n\n - Set DiagnosticOptions.verbosity_level to logging.DEBUG.\n - Activate the environment variable TORCH_LOGS='onnx_diagnostics'.\n\n- For specific information related to node-level FX to ONNX transformations, explore the diagnostic `FXE0008`.\n", + }, + "message_strings": { + "default": { + "text": "Transforming FX graph {graph_name} to ONNX graph." + } + }, + "help_uri": None, + "properties": {"deprecated": False, "tags": []}, + } + ), + init=False, + ) + """Transforms graph from FX IR to ONNX IR.""" + + fx_node_to_onnx: _FxNodeToOnnx = dataclasses.field( + default=_FxNodeToOnnx.from_sarif( + **{ + "id": "FXE0008", + "name": "fx-node-to-onnx", + "short_description": {"text": "Transforms an FX node to an ONNX node."}, + "full_description": { + "text": "Transforms an FX node to an ONNX node.", + "markdown": "This diagnostic tracks the transformation process from an FX Node to ONNX [Operators](https://onnx.ai/onnx/operators/).\n\nThe process of converting FX Node to ONNX Node involves dealing with six distinct node types:\n 1. `placeholder`: Represents a module input, maps to an ONNX graph input.\n 2. `call_module`: Symbolizes a call to a submodule, maps to an ONNX\n 3. `call_method`: Symbolizes a method call. Not yet implemented.\n 4. `call_function`: Symbolizes a function call. [Core ATen](https://pytorch.org/docs/stable/ir.html#core-aten-ir) is expected\n as the function call target. The mapping from ATen to ONNX is implemented by [ONNXScript torchlib](https://github.com/microsoft/onnxscript/tree/main/onnxscript/function_libs/torch_lib/ops).\n This [guide](https://pytorch.org/docs/stable/onnx.html#onnx-script-functions) shows how to write and register a custom symbolic function for call_function FX node.\n 5. `get_attr`: Indicates an attribute access within the current module. Maps to an ONNX graph initializer.\n 6. `output`: Represents the module's output. 
Maps to an ONNX graph output.\n\nFor a granular understanding of how each node type is transformed, refer to the implementation details in `FxOnnxInterpreter`.\n", + }, + "message_strings": { + "default": { + "text": "Transforming FX node {node_repr} to ONNX node." + } + }, + "help_uri": None, + "properties": {"deprecated": False, "tags": []}, + } + ), + init=False, + ) + """Transforms an FX node to an ONNX node.""" + + fx_pass: _FxPass = dataclasses.field( + default=_FxPass.from_sarif( + **{ + "id": "FXE0010", + "name": "fx-pass", + "short_description": { + "text": "FX graph transformation during ONNX export before converting from FX IR to ONNX IR." + }, + "full_description": { + "text": "FX graph transformation during ONNX export before converting from FX IR to ONNX IR.", + "markdown": "This diagnostic tracks the FX passes executed during the ONNX export process prior\nto converting from FX IR (Intermediate Representation) to ONNX IR.\n\nUnder the scope of ONNX export, an FX pass refers to a specific transformation applied to the FX GraphModule.\nThe primary aim of these passes is to streamline the graph into a format that aligns more with the ONNX IR.\nMoreover, these passes work to substitute unsupported FX IR features with those recognized and endorsed by\nONNX IR. Common transformations include, but aren't limited to, decomposition, functionalization and\ntype promotion.\n\nFor those who are interested in a comprehensive log detailing the modifications made during these passes,\nthere are a couple of options:\n\n- Set DiagnosticOptions.verbosity_level to logging.DEBUG.\n- Activate the environment variable TORCH_LOGS='onnx_diagnostics'.\n\nHowever, it's noteworthy that by default, such detailed logging is turned off. The primary reason being\nits considerable impact on performance.\n\nFor an in-depth understanding of each specific pass, please refer to the directory: torch/onnx/_internal/fx/passes.\n", + }, + "message_strings": {"default": {"text": "Running {pass_name} pass."}}, + "help_uri": None, + "properties": {"deprecated": False, "tags": []}, + } + ), + init=False, + ) + """FX graph transformation during ONNX export before converting from FX IR to ONNX IR.""" + + no_symbolic_function_for_call_function: _NoSymbolicFunctionForCallFunction = dataclasses.field( + default=_NoSymbolicFunctionForCallFunction.from_sarif( + **{ + "id": "FXE0011", + "name": "no-symbolic-function-for-call-function", + "short_description": { + "text": 'Cannot find symbolic function to convert the "call_function" FX node to ONNX.' + }, + "full_description": { + "text": 'Cannot find symbolic function to convert the "call_function" FX node to ONNX. ', + "markdown": 'This error occurs when the ONNX converter is unable to find a corresponding symbolic function\nto convert a "call_function" node in the input graph to its equivalence in ONNX. The "call_function"\nnode represents a normalized function call in PyTorch, such as "torch.aten.ops.add".\n\nTo resolve this error, you can try one of the following:\n\n- If exists, apply the auto-fix suggested by the diagnostic. TODO: this part is not available yet.\n- Rewrite the model using only supported PyTorch operators or functions.\n- Follow this [guide](https://pytorch.org/tutorials/beginner/onnx/onnx_registry_tutorial.html#overview) to write and\n register a custom symbolic function for the unsupported call_function FX node.\n', + }, + "message_strings": { + "default": { + "text": 'No symbolic function to convert the "call_function" node {target} to ONNX. 
' + } + }, + "help_uri": None, + "properties": {"deprecated": False, "tags": []}, + } + ), + init=False, + ) + """Cannot find symbolic function to convert the "call_function" FX node to ONNX.""" + + unsupported_fx_node_analysis: _UnsupportedFxNodeAnalysis = dataclasses.field( + default=_UnsupportedFxNodeAnalysis.from_sarif( + **{ + "id": "FXE0012", + "name": "unsupported-fx-node-analysis", + "short_description": { + "text": "Result from FX graph analysis to reveal unsupported FX nodes." + }, + "full_description": { + "text": "Result from FX graph analysis to reveal unsupported FX nodes.", + "markdown": "This error indicates that an FX graph contains one or more unsupported nodes. The error message\nis typically accompanied by a list of the unsupported nodes found during analysis.\n\nTo resolve this error, you can try resolving each individual unsupported node error by following\nthe suggestions by its diagnostic. Typically, options include:\n\n- If exists, apply the auto-fix suggested by the diagnostic. TODO: this part is not available yet.\n- Rewrite the model using only supported PyTorch operators or functions.\n- Follow this [guide](https://pytorch.org/docs/stable/onnx.html#onnx-script-functions) to write and\n register a custom symbolic function for the unsupported call_function FX node.\n", + }, + "message_strings": { + "default": { + "text": "Unsupported FX nodes: {node_op_to_target_mapping}. " + } + }, + "help_uri": None, + "properties": {"deprecated": False, "tags": []}, + } + ), + init=False, + ) + """Result from FX graph analysis to reveal unsupported FX nodes.""" + + op_level_debugging: _OpLevelDebugging = dataclasses.field( + default=_OpLevelDebugging.from_sarif( + **{ + "id": "FXE0013", + "name": "op-level-debugging", + "short_description": { + "text": "Report any op level validation failure in warnings." + }, + "full_description": { + "text": "Report any op level validation failure in warnings.", + "markdown": "This warning message indicates that during op level debugging, certain symbolic functions\nhave failed to match the results of torch ops when using real tensors generated from fake\ntensors. It is important to note that the symbolic functions may not necessarily be\nincorrect, as the validation process is non-deterministic and should only be used as a\nreference.\n\nThere are two categories of warnings that can be triggered:\n\n1. Non-validated operators:\n If the warnings are caused by the following errors, they can be disregarded by users,\n as these errors occur due to the non-deterministic nature of the validation. However,\n it is important to be aware that the operators have not been validated.\n\n - IndexError: Unsupported input arguments of randomized dimensions/indices(INT64).\n - RuntimeError: Unsupported input arguments for torch ops are generated.\n - ValueError: Arguments/keyword arguments do not match the signature of the symbolic function.\n\n2. Potentially wrong torchlib operators:\n If the warnings are triggered by the following error, users should be aware that the symbolic functions\n may be incorrect in dispatching or implementation. 
In such cases, it is recommended to report\n the issue to the PyTorch-ONNX team, or create/register a custom symbolic function to replace the default one.\n\n - AssertionError: The symbolic function is potentially wrong as the results do not match the results of torch ops.\n - TypeError: The symbolic function is potentially wrong as the opschema doesn't match inputs.\n", + }, + "message_strings": { + "default": { + "text": "FX node: {node} and its onnx function: {symbolic_fn} fails on op level validation." + } + }, + "help_uri": None, + "properties": {"deprecated": False, "tags": []}, + } + ), + init=False, + ) + """Report any op level validation failure in warnings.""" + + find_opschema_matched_symbolic_function: _FindOpschemaMatchedSymbolicFunction = dataclasses.field( + default=_FindOpschemaMatchedSymbolicFunction.from_sarif( + **{ + "id": "FXE0014", + "name": "find-opschema-matched-symbolic-function", + "short_description": { + "text": "Find the OnnxFunction that matches the input/attribute dtypes by comparing them with their opschemas." + }, + "full_description": { + "text": "Find the OnnxFunction that matches the input dtypes by comparing them with their opschemas. A warning will be issued if the matched OnnxFunction is not an exact match.", + "markdown": "When an ATen/Custom operator is registered and needs to be dispatched to an OnnxFunction, the input/attribute\ndtypes of the ATen/Custom operator are compared with the input/attribute dtypes of the OnnxFunction opschemas\nto find a match. However, if a perfect/exact match is not found, the dispatcher will attempt to find\nthe nearest match with the highest number of input/attribute dtypes matching the OnnxFunction opschemas, while\nissuing a warning.\n\nThere are two types of level that can be triggered in this rule:\n\n1. NOTE: A perfect match is found, and no warning is issued.\n2. WARNING: The matched OnnxFunction is not a perfect/exact match.\n\nHere are some suggestions based on the WARNING situation:\n\n1. If there are NO errors or mismatches in the results, it is safe to disregard this warning,\n as the definition of OnnxFunction schema is usually more stringent.\n2. If there are errors or mismatches in the results, it is recommended to:\n (a) Enable op_level_debugging to determine if the OnnxFunction might be incorrect.\n (b) Report the issue to the PyTorch-ONNX team.\n (c) Create/register a custom symbolic function to replace the default one.\n", + }, + "message_strings": { + "default": { + "text": "The OnnxFunction: {symbolic_fn} is the nearest match of the node {node}." + } + }, + "help_uri": None, + "properties": {"deprecated": False, "tags": []}, + } + ), + init=False, + ) + """Find the OnnxFunction that matches the input/attribute dtypes by comparing them with their opschemas.""" + + fx_node_insert_type_promotion: _FxNodeInsertTypePromotion = dataclasses.field( + default=_FxNodeInsertTypePromotion.from_sarif( + **{ + "id": "FXE0015", + "name": "fx-node-insert-type-promotion", + "short_description": { + "text": "Determine if type promotion is required for the FX node. Insert cast nodes if needed." + }, + "full_description": { + "text": "Determine if type promotion is required for the FX node. Insert cast nodes if needed.", + "markdown": "This diagnostic monitors the node-level type promotion insertion process. In PyTorch, there is an automatic process called implicit type promotion,\nwhere the input types of an operator are promoted to a common type. 
The determination of the common type is based on the type promotion rule specific to each operator.\nTo learn more about PyTorch's type promotion rules, refer to the [elementwise_dtypes doc](https://github.com/pytorch/pytorch/blob/f044613f78df713fb57f70c608483c9f10ad332e/torch/_prims_common/__init__.py#L1252-L1335)\nand [torch._refs ops](https://github.com/pytorch/pytorch/blob/a475ea4542dfe961c9d097e33ab5041f61c8c17f/torch/_refs/__init__.py#L484).\n\nHowever, implicit type promotion is not supported in ONNX. Therefore, to replicate the PyTorch behavior, we need to explicitly insert cast nodes.\nThis diagnostic tracks the process of node-level type promotion insertion.\n\nThe type promotion rules used by this process can be found in `torch/onnx/_internal/fx/passes/type_promotion.py.`\nTo update or add new type promotion rules, please refer to the [Note: Update type promotion rule] section.\n", + }, + "message_strings": { + "default": { + "text": "Performing explicit type promotion for node {target}. " + } + }, + "help_uri": None, + "properties": {"deprecated": False, "tags": []}, + } + ), + init=False, + ) + """Determine if type promotion is required for the FX node. Insert cast nodes if needed.""" + + find_operator_overloads_in_onnx_registry: _FindOperatorOverloadsInOnnxRegistry = dataclasses.field( + default=_FindOperatorOverloadsInOnnxRegistry.from_sarif( + **{ + "id": "FXE0016", + "name": "find-operator-overloads-in-onnx-registry", + "short_description": { + "text": "Find the list of OnnxFunction of the PyTorch operator in onnx registry." + }, + "full_description": { + "text": "This rule involves finding the list of OnnxFunction for the PyTorch operator overload in the ONNX registry. If the operator overload is not supported but its default overload is, a warning will be issued. If both the operator overload and its default overload are not supported, an error will be issued.", + "markdown": "The operator overload name serves the purpose of verifying whether a PyTorch operator is registered in the ONNX registry.\nIf it's not found, the dispatcher takes a fallback approach and tries to locate the default overload of the PyTorch\noperator in the registry. If even the default overload is absent, it signifies that the operator is officially unsupported.\n\nThere are three types of level that can be triggered in this rule:\n\n1. NOTE: The op overload is supported.\n2. WARNING: The op overload is not supported, but it's default overload is supported.\n3. ERROR: The op overload is not supported, and it's default overload is also not supported.\n\nHere are some suggestions based on the WARNING situation:\n\n1. If there are NO errors or mismatches in the results, it is safe to disregard this warning.\n2. If there are errors or mismatches in the results, it is recommended to:\n (a) Enable op_level_debugging to determine if the OnnxFunction might be incorrect.\n (b) Report the unsupported overload to the PyTorch-ONNX team.\n (c) Create/register a custom symbolic function to replace the default one.\n\nHere are some suggestions based on the ERROR situation:\n\n1. Report the unsupported operator to the PyTorch-ONNX team.\n2. Create/register a custom symbolic function to replace the default one.\n", + }, + "message_strings": { + "default": { + "text": "Checking if the FX node: {node} is supported in onnx registry." 
+ } + }, + "help_uri": None, + "properties": {"deprecated": False, "tags": []}, + } + ), + init=False, + ) + """Find the list of OnnxFunction of the PyTorch operator in onnx registry.""" + + +rules = _POERules() diff --git a/.venv/Lib/site-packages/torch/onnx/_internal/diagnostics/infra/_infra.py b/.venv/Lib/site-packages/torch/onnx/_internal/diagnostics/infra/_infra.py new file mode 100644 index 0000000000000000000000000000000000000000..468936391a270ff56646a9e86fb246dc013b3c43 --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/_internal/diagnostics/infra/_infra.py @@ -0,0 +1,285 @@ +# mypy: allow-untyped-defs +"""This file defines an additional layer of abstraction on top of the SARIF OM.""" + +from __future__ import annotations + +import dataclasses +import enum +import logging +from typing import Mapping, Sequence + +from torch.onnx._internal.diagnostics.infra import formatter, sarif + + +class Level(enum.IntEnum): + """The level of a diagnostic. + + This class is used to represent the level of a diagnostic. The levels are defined + by the SARIF specification, and are not modifiable. For alternative categories, + please use infra.Tag instead. When selecting a level, please consider the following + guidelines: + + - NONE: Informational result that does not indicate the presence of a problem. + - NOTE: An opportunity for improvement was found. + - WARNING: A potential problem was found. + - ERROR: A serious problem was found. + + This level is a subclass of enum.IntEnum, and can be used as an integer. Its integer + value maps to the logging levels in Python's logging module. The mapping is as + follows: + + Level.NONE = logging.DEBUG = 10 + Level.NOTE = logging.INFO = 20 + Level.WARNING = logging.WARNING = 30 + Level.ERROR = logging.ERROR = 40 + """ + + NONE = 10 + NOTE = 20 + WARNING = 30 + ERROR = 40 + + +levels = Level + + +class Tag(enum.Enum): + """The tag of a diagnostic. This class can be inherited to define custom tags.""" + + +class PatchedPropertyBag(sarif.PropertyBag): + """Key/value pairs that provide additional information about the object. + + The definition of PropertyBag via SARIF spec is "A property bag is an object (section 3.6) + containing an unordered set of properties with arbitrary names." However it is not + reflected in the json file, and therefore not captured by the python representation. + This patch adds additional **kwargs to the `__init__` method to allow recording + arbitrary key/value pairs. 
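+ + Example (an illustrative sketch, not part of the upstream file; the extra key below is an arbitrary placeholder): + >>> bag = PatchedPropertyBag(tags=["onnx"], custom_key="custom_value") + >>> bag.custom_key + 'custom_value'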
+ """ + + def __init__(self, tags: list[str] | None = None, **kwargs): + super().__init__(tags=tags) + self.__dict__.update(kwargs) + + +@dataclasses.dataclass(frozen=True) +class Rule: + id: str + name: str + message_default_template: str + short_description: str | None = None + full_description: str | None = None + full_description_markdown: str | None = None + help_uri: str | None = None + + @classmethod + def from_sarif(cls, **kwargs): + """Returns a rule from the SARIF reporting descriptor.""" + short_description = kwargs.get("short_description", {}).get("text") + full_description = kwargs.get("full_description", {}).get("text") + full_description_markdown = kwargs.get("full_description", {}).get("markdown") + help_uri = kwargs.get("help_uri") + + rule = cls( + id=kwargs["id"], + name=kwargs["name"], + message_default_template=kwargs["message_strings"]["default"]["text"], + short_description=short_description, + full_description=full_description, + full_description_markdown=full_description_markdown, + help_uri=help_uri, + ) + return rule + + def sarif(self) -> sarif.ReportingDescriptor: + """Returns a SARIF reporting descriptor of this Rule.""" + short_description = ( + sarif.MultiformatMessageString(text=self.short_description) + if self.short_description is not None + else None + ) + full_description = ( + sarif.MultiformatMessageString( + text=self.full_description, markdown=self.full_description_markdown + ) + if self.full_description is not None + else None + ) + return sarif.ReportingDescriptor( + id=self.id, + name=self.name, + short_description=short_description, + full_description=full_description, + help_uri=self.help_uri, + ) + + def format(self, level: Level, *args, **kwargs) -> tuple[Rule, Level, str]: + """Returns a tuple of (rule, level, message) for a diagnostic. + + This method is used to format the message of a diagnostic. The message is + formatted using the default template of this rule, and the arguments passed in + as `*args` and `**kwargs`. The level is used to override the default level of + this rule. + """ + return (self, level, self.format_message(*args, **kwargs)) + + def format_message(self, *args, **kwargs) -> str: + """Returns the formatted default message of this Rule. + + This method should be overridden (with code generation) by subclasses to reflect + the exact arguments needed by the message template. This is a helper method to + create the default message for a diagnostic. 
+ """ + return self.message_default_template.format(*args, **kwargs) + + +@dataclasses.dataclass +class Location: + uri: str | None = None + line: int | None = None + message: str | None = None + start_column: int | None = None + end_column: int | None = None + snippet: str | None = None + function: str | None = None + + def sarif(self) -> sarif.Location: + """Returns the SARIF representation of this location.""" + return sarif.Location( + physical_location=sarif.PhysicalLocation( + artifact_location=sarif.ArtifactLocation(uri=self.uri), + region=sarif.Region( + start_line=self.line, + start_column=self.start_column, + end_column=self.end_column, + snippet=sarif.ArtifactContent(text=self.snippet), + ), + ), + message=sarif.Message(text=self.message) + if self.message is not None + else None, + ) + + +@dataclasses.dataclass +class StackFrame: + location: Location + + def sarif(self) -> sarif.StackFrame: + """Returns the SARIF representation of this stack frame.""" + return sarif.StackFrame(location=self.location.sarif()) + + +@dataclasses.dataclass +class Stack: + """Records a stack trace. The frames are in order from newest to oldest stack frame.""" + + frames: list[StackFrame] = dataclasses.field(default_factory=list) + message: str | None = None + + def sarif(self) -> sarif.Stack: + """Returns the SARIF representation of this stack.""" + return sarif.Stack( + frames=[frame.sarif() for frame in self.frames], + message=sarif.Message(text=self.message) + if self.message is not None + else None, + ) + + +@dataclasses.dataclass +class ThreadFlowLocation: + """Records code location and the initial state.""" + + location: Location + state: Mapping[str, str] + index: int + stack: Stack | None = None + + def sarif(self) -> sarif.ThreadFlowLocation: + """Returns the SARIF representation of this thread flow location.""" + return sarif.ThreadFlowLocation( + location=self.location.sarif(), + state=self.state, + stack=self.stack.sarif() if self.stack is not None else None, + ) + + +@dataclasses.dataclass +class Graph: + """A graph of diagnostics. + + This class stores the string representation of a model graph. + The `nodes` and `edges` fields are unused in the current implementation. + """ + + graph: str + name: str + description: str | None = None + + def sarif(self) -> sarif.Graph: + """Returns the SARIF representation of this graph.""" + return sarif.Graph( + description=sarif.Message(text=self.graph), + properties=PatchedPropertyBag(name=self.name, description=self.description), + ) + + +@dataclasses.dataclass +class RuleCollection: + _rule_id_name_set: frozenset[tuple[str, str]] = dataclasses.field(init=False) + + def __post_init__(self) -> None: + self._rule_id_name_set = frozenset( + { + (field.default.id, field.default.name) + for field in dataclasses.fields(self) + if isinstance(field.default, Rule) + } + ) + + def __contains__(self, rule: Rule) -> bool: + """Checks if the rule is in the collection.""" + return (rule.id, rule.name) in self._rule_id_name_set + + @classmethod + def custom_collection_from_list( + cls, new_collection_class_name: str, rules: Sequence[Rule] + ) -> RuleCollection: + """Creates a custom class inherited from RuleCollection with the list of rules.""" + return dataclasses.make_dataclass( + new_collection_class_name, + [ + ( + formatter.kebab_case_to_snake_case(rule.name), + type(rule), + dataclasses.field(default=rule), + ) + for rule in rules + ], + bases=(cls,), + )() + + +class Invocation: + # TODO: Implement this. 
+ # Tracks top level call arguments and diagnostic options. + def __init__(self) -> None: + raise NotImplementedError + + +@dataclasses.dataclass +class DiagnosticOptions: + """Options for diagnostic context. + + Attributes: + verbosity_level: Set the amount of information logged for each diagnostics, + equivalent to the 'level' in Python logging module. + warnings_as_errors: When True, warning diagnostics are treated as error diagnostics. + """ + + verbosity_level: int = dataclasses.field(default=logging.INFO) + """Set the amount of information logged for each diagnostics, equivalent to the 'level' in Python logging module.""" + + warnings_as_errors: bool = dataclasses.field(default=False) + """If True, warning diagnostics are treated as error diagnostics.""" diff --git a/.venv/Lib/site-packages/torch/onnx/_internal/diagnostics/infra/context.py b/.venv/Lib/site-packages/torch/onnx/_internal/diagnostics/infra/context.py new file mode 100644 index 0000000000000000000000000000000000000000..a32614fd3c9fdf9a37171f3cc38a29699086074b --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/_internal/diagnostics/infra/context.py @@ -0,0 +1,404 @@ +# mypy: allow-untyped-defs +"""A diagnostic context based on SARIF.""" + +from __future__ import annotations + +import contextlib +import dataclasses +import gzip +import logging +from typing import Callable, Generator, Generic, Literal, Mapping, TypeVar +from typing_extensions import Self + +from torch.onnx._internal.diagnostics import infra +from torch.onnx._internal.diagnostics.infra import formatter, sarif, utils +from torch.onnx._internal.diagnostics.infra.sarif import version as sarif_version + + +# This is a workaround for mypy not supporting Self from typing_extensions. +_Diagnostic = TypeVar("_Diagnostic", bound="Diagnostic") +diagnostic_logger: logging.Logger = logging.getLogger(__name__) + + +@dataclasses.dataclass +class Diagnostic: + rule: infra.Rule + level: infra.Level + message: str | None = None + locations: list[infra.Location] = dataclasses.field(default_factory=list) + stacks: list[infra.Stack] = dataclasses.field(default_factory=list) + graphs: list[infra.Graph] = dataclasses.field(default_factory=list) + thread_flow_locations: list[infra.ThreadFlowLocation] = dataclasses.field( + default_factory=list + ) + additional_messages: list[str] = dataclasses.field(default_factory=list) + tags: list[infra.Tag] = dataclasses.field(default_factory=list) + source_exception: Exception | None = None + """The exception that caused this diagnostic to be created.""" + logger: logging.Logger = dataclasses.field(init=False, default=diagnostic_logger) + """The logger for this diagnostic. 
Defaults to 'diagnostic_logger' which has the same + log level setting with `DiagnosticOptions.verbosity_level`.""" + _current_log_section_depth: int = 0 + + def __post_init__(self) -> None: + pass + + def sarif(self) -> sarif.Result: + """Returns the SARIF Result representation of this diagnostic.""" + message = self.message or self.rule.message_default_template + if self.additional_messages: + additional_message = "\n".join(self.additional_messages) + message_markdown = ( + f"{message}\n\n## Additional Message:\n\n{additional_message}" + ) + else: + message_markdown = message + + kind: Literal["informational", "fail"] = ( + "informational" if self.level == infra.Level.NONE else "fail" + ) + + sarif_result = sarif.Result( + message=sarif.Message(text=message, markdown=message_markdown), + level=self.level.name.lower(), # type: ignore[arg-type] + rule_id=self.rule.id, + kind=kind, + ) + sarif_result.locations = [location.sarif() for location in self.locations] + sarif_result.stacks = [stack.sarif() for stack in self.stacks] + sarif_result.graphs = [graph.sarif() for graph in self.graphs] + sarif_result.code_flows = [ + sarif.CodeFlow( + thread_flows=[ + sarif.ThreadFlow( + locations=[loc.sarif() for loc in self.thread_flow_locations] + ) + ] + ) + ] + sarif_result.properties = sarif.PropertyBag( + tags=[tag.value for tag in self.tags] + ) + return sarif_result + + def with_location(self: Self, location: infra.Location) -> Self: + """Adds a location to the diagnostic.""" + self.locations.append(location) + return self + + def with_thread_flow_location( + self: Self, location: infra.ThreadFlowLocation + ) -> Self: + """Adds a thread flow location to the diagnostic.""" + self.thread_flow_locations.append(location) + return self + + def with_stack(self: Self, stack: infra.Stack) -> Self: + """Adds a stack to the diagnostic.""" + self.stacks.append(stack) + return self + + def with_graph(self: Self, graph: infra.Graph) -> Self: + """Adds a graph to the diagnostic.""" + self.graphs.append(graph) + return self + + @contextlib.contextmanager + def log_section( + self, level: int, message: str, *args, **kwargs + ) -> Generator[None, None, None]: + """ + Context manager for a section of log messages, denoted by a title message and increased indentation. + + Same api as `logging.Logger.log`. + + This context manager logs the given title at the specified log level, increases the current + section depth for subsequent log messages, and ensures that the section depth is decreased + again when exiting the context. + + Args: + level: The log level. + message: The title message to log. + *args: The arguments to the message. Use `LazyString` to defer the + expensive evaluation of the arguments until the message is actually logged. + **kwargs: The keyword arguments for `logging.Logger.log`. + + Yields: + None: This context manager does not yield any value. + + Example: + >>> with DiagnosticContext("DummyContext", "1.0"): + ... rule = infra.Rule("RuleID", "DummyRule", "Rule message") + ... diagnostic = Diagnostic(rule, infra.Level.WARNING) + ... with diagnostic.log_section(logging.INFO, "My Section"): + ... diagnostic.log(logging.INFO, "My Message") + ... with diagnostic.log_section(logging.INFO, "My Subsection"): + ... diagnostic.log(logging.INFO, "My Submessage") + ... 
diagnostic.additional_messages + ['## My Section', 'My Message', '### My Subsection', 'My Submessage'] + """ + if self.logger.isEnabledFor(level): + indented_format_message = ( + f"##{'#' * self._current_log_section_depth } {message}" + ) + self.log( + level, + indented_format_message, + *args, + **kwargs, + ) + self._current_log_section_depth += 1 + try: + yield + finally: + self._current_log_section_depth -= 1 + + def log(self, level: int, message: str, *args, **kwargs) -> None: + """Logs a message within the diagnostic. Same api as `logging.Logger.log`. + + If logger is not enabled for the given level, the message will not be logged. + Otherwise, the message will be logged and also added to the diagnostic's additional_messages. + + The default setting for `DiagnosticOptions.verbosity_level` is `logging.INFO`. Based on this default, + the log level recommendations are as follows. If you've set a different default verbosity level in your + application, please adjust accordingly: + + - logging.ERROR: Log any events leading to application failure. + - logging.WARNING: Log events that might result in application issues or failures, although not guaranteed. + - logging.INFO: Log general useful information, ensuring minimal performance overhead. + - logging.DEBUG: Log detailed debug information, which might affect performance when logged. + + Args: + level: The log level. + message: The message to log. + *args: The arguments to the message. Use `LazyString` to defer the + expensive evaluation of the arguments until the message is actually logged. + **kwargs: The keyword arguments for `logging.Logger.log`. + """ + if self.logger.isEnabledFor(level): + formatted_message = message % args + self.logger.log(level, formatted_message, **kwargs) + self.additional_messages.append(formatted_message) + + def debug(self, message: str, *args, **kwargs) -> None: + """Logs a debug message within the diagnostic. Same api as logging.Logger.debug. + + Checkout `log` for more details. + """ + self.log(logging.DEBUG, message, *args, **kwargs) + + def info(self, message: str, *args, **kwargs) -> None: + """Logs an info message within the diagnostic. Same api as logging.Logger.info. + + Checkout `log` for more details. + """ + self.log(logging.INFO, message, *args, **kwargs) + + def warning(self, message: str, *args, **kwargs) -> None: + """Logs a warning message within the diagnostic. Same api as logging.Logger.warning. + + Checkout `log` for more details. + """ + self.log(logging.WARNING, message, *args, **kwargs) + + def error(self, message: str, *args, **kwargs) -> None: + """Logs an error message within the diagnostic. Same api as logging.Logger.error. + + Checkout `log` for more details. + """ + self.log(logging.ERROR, message, *args, **kwargs) + + def log_source_exception(self, level: int, exception: Exception) -> None: + """Logs a source exception within the diagnostic. + + Invokes `log_section` and `log` to log the exception in markdown section format. + """ + self.source_exception = exception + with self.log_section(level, "Exception log"): + self.log(level, "%s", formatter.lazy_format_exception(exception)) + + def record_python_call_stack(self, frames_to_skip: int) -> infra.Stack: + """Records the current Python call stack.""" + frames_to_skip += 1 # Skip this function. 
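+ # The extra skipped frame is this helper's own frame, so the captured stack + # (and the location taken from its top frame below) starts at the caller.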
+ stack = utils.python_call_stack(frames_to_skip=frames_to_skip) + self.with_stack(stack) + if len(stack.frames) > 0: + self.with_location(stack.frames[0].location) + return stack + + def record_python_call( + self, + fn: Callable, + state: Mapping[str, str], + message: str | None = None, + frames_to_skip: int = 0, + ) -> infra.ThreadFlowLocation: + """Records a python call as one thread flow step.""" + frames_to_skip += 1 # Skip this function. + stack = utils.python_call_stack(frames_to_skip=frames_to_skip, frames_to_log=5) + location = utils.function_location(fn) + location.message = message + # Add function location to the top of the stack. + stack.frames.insert(0, infra.StackFrame(location=location)) + thread_flow_location = infra.ThreadFlowLocation( + location=location, + state=state, + index=len(self.thread_flow_locations), + stack=stack, + ) + self.with_thread_flow_location(thread_flow_location) + return thread_flow_location + + +class RuntimeErrorWithDiagnostic(RuntimeError): + """Runtime error with enclosed diagnostic information.""" + + def __init__(self, diagnostic: Diagnostic): + super().__init__(diagnostic.message) + self.diagnostic = diagnostic + + +@dataclasses.dataclass +class DiagnosticContext(Generic[_Diagnostic]): + name: str + version: str + options: infra.DiagnosticOptions = dataclasses.field( + default_factory=infra.DiagnosticOptions + ) + diagnostics: list[_Diagnostic] = dataclasses.field(init=False, default_factory=list) + # TODO(bowbao): Implement this. + # _invocation: infra.Invocation = dataclasses.field(init=False) + _inflight_diagnostics: list[_Diagnostic] = dataclasses.field( + init=False, default_factory=list + ) + _previous_log_level: int = dataclasses.field(init=False, default=logging.WARNING) + logger: logging.Logger = dataclasses.field(init=False, default=diagnostic_logger) + _bound_diagnostic_type: type = dataclasses.field(init=False, default=Diagnostic) + + def __enter__(self): + self._previous_log_level = self.logger.level + self.logger.setLevel(self.options.verbosity_level) + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + self.logger.setLevel(self._previous_log_level) + return None + + def sarif(self) -> sarif.Run: + """Returns the SARIF Run object.""" + unique_rules = {diagnostic.rule for diagnostic in self.diagnostics} + return sarif.Run( + sarif.Tool( + driver=sarif.ToolComponent( + name=self.name, + version=self.version, + rules=[rule.sarif() for rule in unique_rules], + ) + ), + results=[diagnostic.sarif() for diagnostic in self.diagnostics], + ) + + def sarif_log(self) -> sarif.SarifLog: # type: ignore[name-defined] + """Returns the SARIF Log object.""" + return sarif.SarifLog( + version=sarif_version.SARIF_VERSION, + schema_uri=sarif_version.SARIF_SCHEMA_LINK, + runs=[self.sarif()], + ) + + def to_json(self) -> str: + return formatter.sarif_to_json(self.sarif_log()) + + def dump(self, file_path: str, compress: bool = False) -> None: + """Dumps the SARIF log to a file.""" + if compress: + with gzip.open(file_path, "wt") as f: + f.write(self.to_json()) + else: + with open(file_path, "w") as f: + f.write(self.to_json()) + + def log(self, diagnostic: _Diagnostic) -> None: + """Logs a diagnostic. + + This method should be used only after all the necessary information for the diagnostic + has been collected. + + Args: + diagnostic: The diagnostic to add. 
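+ + Example (an illustrative sketch, not part of the upstream file; the name and version strings are dummy values): + >>> context = DiagnosticContext("torch.onnx.export", "2.5.0") + >>> rule = infra.Rule("RULE0001", "dummy-rule", "Dummy message") + >>> context.log(Diagnostic(rule, infra.Level.NOTE)) + >>> len(context.diagnostics) + 1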
+ """ + if not isinstance(diagnostic, self._bound_diagnostic_type): + raise TypeError( + f"Expected diagnostic of type {self._bound_diagnostic_type}, got {type(diagnostic)}" + ) + if self.options.warnings_as_errors and diagnostic.level == infra.Level.WARNING: # type: ignore[attr-defined] + diagnostic.level = infra.Level.ERROR # type: ignore[attr-defined] + self.diagnostics.append(diagnostic) # type: ignore[arg-type] + + def log_and_raise_if_error(self, diagnostic: _Diagnostic) -> None: + """Logs a diagnostic and raises an exception if it is an error. + + Use this method for logging non inflight diagnostics where diagnostic level is not known or + lower than ERROR. If it is always expected raise, use `log` and explicit + `raise` instead. Otherwise there is no way to convey the message that it always + raises to Python intellisense and type checking tools. + + This method should be used only after all the necessary information for the diagnostic + has been collected. + + Args: + diagnostic: The diagnostic to add. + """ + self.log(diagnostic) + if diagnostic.level == infra.Level.ERROR: + if diagnostic.source_exception is not None: + raise diagnostic.source_exception + raise RuntimeErrorWithDiagnostic(diagnostic) + + @contextlib.contextmanager + def add_inflight_diagnostic( + self, diagnostic: _Diagnostic + ) -> Generator[_Diagnostic, None, None]: + """Adds a diagnostic to the context. + + Use this method to add diagnostics that are not created by the context. + Args: + diagnostic: The diagnostic to add. + """ + self._inflight_diagnostics.append(diagnostic) + try: + yield diagnostic + finally: + self._inflight_diagnostics.pop() + + def push_inflight_diagnostic(self, diagnostic: _Diagnostic) -> None: + """Pushes a diagnostic to the inflight diagnostics stack. + + Args: + diagnostic: The diagnostic to push. + + Raises: + ValueError: If the rule is not supported by the tool. + """ + self._inflight_diagnostics.append(diagnostic) + + def pop_inflight_diagnostic(self) -> _Diagnostic: + """Pops the last diagnostic from the inflight diagnostics stack. + + Returns: + The popped diagnostic. + """ + return self._inflight_diagnostics.pop() + + def inflight_diagnostic(self, rule: infra.Rule | None = None) -> _Diagnostic: + if rule is None: + # TODO(bowbao): Create builtin-rules and create diagnostic using that. + if len(self._inflight_diagnostics) <= 0: + raise AssertionError("No inflight diagnostics") + + return self._inflight_diagnostics[-1] + else: + for diagnostic in reversed(self._inflight_diagnostics): + if diagnostic.rule == rule: + return diagnostic + raise AssertionError(f"No inflight diagnostic for rule {rule.name}") diff --git a/.venv/Lib/site-packages/torch/onnx/_internal/diagnostics/infra/decorator.py b/.venv/Lib/site-packages/torch/onnx/_internal/diagnostics/infra/decorator.py new file mode 100644 index 0000000000000000000000000000000000000000..e59c97264a7565fa26e4d060625508004e82e577 --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/_internal/diagnostics/infra/decorator.py @@ -0,0 +1,153 @@ +# mypy: allow-untyped-defs +from __future__ import annotations + +import functools +import logging +import traceback +from typing import Any, Callable, Dict, Tuple + +from torch.onnx._internal.diagnostics import infra +from torch.onnx._internal.diagnostics.infra import formatter, utils + + +MessageFormatterType = Callable[..., str] + + +def format_message_in_text(fn: Callable, *args: Any, **kwargs: Any) -> str: + return f"{formatter.display_name(fn)}. 
" + + +def format_exception_in_markdown(exception: Exception) -> str: + msg_list = ["### Exception log", "```"] + msg_list.extend( + traceback.format_exception(type(exception), exception, exception.__traceback__) + ) + msg_list.append("```") + return "\n".join(msg_list) + + +def format_function_signature_in_markdown( + fn: Callable, + args: tuple[Any, ...], + kwargs: dict[str, Any], + format_argument: Callable[[Any], str] = formatter.format_argument, +) -> str: + msg_list = [f"### Function Signature {formatter.display_name(fn)}"] + + state = utils.function_state(fn, args, kwargs) + + for k, v in state.items(): + msg_list.append(f"- {k}: {format_argument(v)}") + + return "\n".join(msg_list) + + +def format_return_values_in_markdown( + return_values: Any, + format_argument: Callable[[Any], str] = formatter.format_argument, +) -> str: + return f"{format_argument(return_values)}" + + +ModifierCallableType = Callable[ + [infra.Diagnostic, Callable, Tuple[Any, ...], Dict[str, Any], Any], None +] + + +def diagnose_call( + rule: infra.Rule, + *, + level: infra.Level = infra.Level.NONE, + diagnostic_type: type[infra.Diagnostic] = infra.Diagnostic, + format_argument: Callable[[Any], str] = formatter.format_argument, + diagnostic_message_formatter: MessageFormatterType = format_message_in_text, +) -> Callable: + def decorator(fn): + @functools.wraps(fn) + def wrapper(*args, **kwargs): + common_error_message = "diagnose_call can only be applied to callables" + if not callable(fn): + raise AssertionError( + f"{common_error_message}. Got {type(fn)} instead of callable." + ) + arg0 = args[0] if len(args) > 0 else None + if isinstance(ctx := arg0, infra.DiagnosticContext): + pass + elif isinstance( + ctx := getattr(arg0, "diagnostic_context", None), + infra.DiagnosticContext, + ): + pass + else: + # NOTE: At decorate time, it can't tell if a callable is function or method. + # Technically both are regarded as function at that time. + raise AssertionError( + f"{common_error_message}. For {fn}, " + f"If it is a function, a DiagnosticContext instance must be present as " + f"the first argument. " + f"If it is a method, a DiagnosticContext instance must be present as " + f"the attribute 'diagnostic_context' of the 'self' argument." + ) + + diag = diagnostic_type( + rule, + level, + diagnostic_message_formatter(fn, *args, **kwargs), + ) + + # pop the decorator frame + # TODO(bowbao): by default diagnostic doesn't have stack. + # So need to check before doing this. Make the code cleaner. + # Option: do not capture stack by default in diagnostic initialization. + stack: infra.Stack | None = None + if len(diag.stacks) > 0: + stack = diag.stacks[0] + stack.frames.pop(0) + + # set function location + fn_location = utils.function_location(fn) + diag.locations.insert(0, fn_location) + # Add function location to the top of the stack. 
+ if stack is not None: + stack.frames.insert(0, infra.StackFrame(location=fn_location)) + + with diag.log_section(logging.INFO, "Function Signature"): + diag.log( + logging.INFO, + "%s", + formatter.LazyString( + format_function_signature_in_markdown, + fn, + args, + kwargs, + format_argument, + ), + ) + + return_values: Any = None + with ctx.add_inflight_diagnostic(diag) as diag: + try: + return_values = fn(*args, **kwargs) + with diag.log_section(logging.INFO, "Return values"): + diag.log( + logging.INFO, + "%s", + formatter.LazyString( + format_return_values_in_markdown, + return_values, + format_argument, + ), + ) + return return_values + except Exception as e: + diag.log_source_exception(logging.ERROR, e) + diag.level = infra.Level.ERROR + finally: + ctx.log_and_raise_if_error(diag) + + return wrapper + + return decorator + + +# TODO(bowbao): decorator to report only when failed. diff --git a/.venv/Lib/site-packages/torch/onnx/_internal/diagnostics/infra/formatter.py b/.venv/Lib/site-packages/torch/onnx/_internal/diagnostics/infra/formatter.py new file mode 100644 index 0000000000000000000000000000000000000000..746a75c53c8fdbddeac0afe188424f8df49c6ad1 --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/_internal/diagnostics/infra/formatter.py @@ -0,0 +1,106 @@ +from __future__ import annotations + +import dataclasses +import json +import re +import traceback +from typing import Any, Callable, Union + +from torch._logging import LazyString +from torch.onnx._internal.diagnostics.infra import sarif + + +# A list of types in the SARIF module to support pretty printing. +# This is solely for type annotation for the functions below. +_SarifClass = Union[ + sarif.SarifLog, + sarif.Run, + sarif.ReportingDescriptor, + sarif.Result, +] + + +def lazy_format_exception(exception: Exception) -> LazyString: + return LazyString( + lambda: "\n".join( + ( + "```", + *traceback.format_exception( + type(exception), exception, exception.__traceback__ + ), + "```", + ) + ), + ) + + +def snake_case_to_camel_case(s: str) -> str: + splits = s.split("_") + if len(splits) <= 1: + return s + return "".join([splits[0], *map(str.capitalize, splits[1:])]) + + +def camel_case_to_snake_case(s: str) -> str: + return re.sub(r"([A-Z])", r"_\1", s).lower() + + +def kebab_case_to_snake_case(s: str) -> str: + return s.replace("-", "_") + + +def _convert_key( + object: dict[str, Any] | Any, convert: Callable[[str], str] +) -> dict[str, Any] | Any: + """Convert and update keys in a dictionary with "convert". + + Any value that is a dictionary will be recursively updated. + Any value that is a list will be recursively searched. + + Args: + object: The object to update. + convert: The function to convert the keys, e.g. `kebab_case_to_snake_case`. + + Returns: + The updated object. + """ + if not isinstance(object, dict): + return object + new_dict = {} + for k, v in object.items(): + new_k = convert(k) + if isinstance(v, dict): + new_v = _convert_key(v, convert) + elif isinstance(v, list): + new_v = [_convert_key(elem, convert) for elem in v] + else: + new_v = v + if new_v is None: + # Otherwise unnecessarily bloated sarif log with "null"s. + continue + if new_v == -1: + # WAR: -1 as default value shouldn't be logged into sarif. 
+ continue + + new_dict[new_k] = new_v + + return new_dict + + +def sarif_to_json(attr_cls_obj: _SarifClass, indent: str | None = " ") -> str: + dict = dataclasses.asdict(attr_cls_obj) + dict = _convert_key(dict, snake_case_to_camel_case) + return json.dumps(dict, indent=indent, separators=(",", ":")) + + +def format_argument(obj: Any) -> str: + return f"{type(obj)}" + + +def display_name(fn: Callable) -> str: + if hasattr(fn, "__qualname__"): + return fn.__qualname__ + elif hasattr(fn, "__name__"): + return fn.__name__ + else: + return str(fn) diff --git a/.venv/Lib/site-packages/torch/onnx/_internal/diagnostics/infra/utils.py b/.venv/Lib/site-packages/torch/onnx/_internal/diagnostics/infra/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..388c53dd6102a8f1d21ad1e0d2b94b222ffcb39e --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/_internal/diagnostics/infra/utils.py @@ -0,0 +1,69 @@ +from __future__ import annotations + +import functools +import inspect +import traceback +from typing import Any, Callable, Mapping, Sequence + +from torch.onnx._internal.diagnostics.infra import _infra, formatter + + +def python_frame(frame: traceback.FrameSummary) -> _infra.StackFrame: + """Returns a StackFrame for the given traceback.FrameSummary.""" + snippet = frame.line + + return _infra.StackFrame( + location=_infra.Location( + uri=frame.filename, + line=frame.lineno, + snippet=snippet, + function=frame.name, + message=snippet, + ) + ) + + +def python_call_stack(frames_to_skip: int = 0, frames_to_log: int = 16) -> _infra.Stack: + """Returns the current Python call stack.""" + if frames_to_skip < 0: + raise ValueError("frames_to_skip must be non-negative") + if frames_to_log < 0: + raise ValueError("frames_to_log must be non-negative") + frames_to_skip += 1 # Skip this function. + stack = _infra.Stack() + # Frames are returned in order of oldest to newest. + frames = traceback.extract_stack(limit=frames_to_skip + frames_to_log) + frames.reverse() + stack.frames = [python_frame(frame) for frame in frames[frames_to_skip:]] + stack.message = "Python call stack" + return stack + + +@functools.lru_cache +def _function_source_info(fn: Callable) -> tuple[Sequence[str], int, str | None]: + """Returns the source lines, line number, and source file path for the given function. + + Essentially, inspect.getsourcelines() and inspect.getsourcefile() combined. + Caching is applied to reduce the performance impact of this function. 
+ """ + source_lines, lineno = inspect.getsourcelines(fn) + return source_lines, lineno, inspect.getsourcefile(fn) + + +def function_location(fn: Callable) -> _infra.Location: + """Returns a Location for the given function.""" + source_lines, lineno, uri = _function_source_info(fn) + snippet = source_lines[0].strip() if len(source_lines) > 0 else "" + return _infra.Location( + uri=uri, + line=lineno, + snippet=snippet, + message=formatter.display_name(fn), + ) + + +def function_state( + fn: Callable, args: tuple[Any, ...], kwargs: dict[str, Any] +) -> Mapping[str, Any]: + bind = inspect.signature(fn).bind(*args, **kwargs) + return bind.arguments diff --git a/.venv/Lib/site-packages/torch/onnx/_internal/io_adapter.py b/.venv/Lib/site-packages/torch/onnx/_internal/io_adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..00911f53194ab5051b29dea79ab5e21e7b95a806 --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/_internal/io_adapter.py @@ -0,0 +1,641 @@ +# mypy: allow-untyped-defs +from __future__ import annotations + +from typing import ( + Any, + Callable, + Mapping, + Protocol, + runtime_checkable, + Sequence, + TYPE_CHECKING, +) + +import torch +import torch.export as torch_export +from torch.utils import _pytree as pytree + + +if TYPE_CHECKING: + import inspect + +# TODO(bowbao): Add diagnostics for IO adapters. + + +@runtime_checkable +class InputAdaptStep(Protocol): + """A protocol that defines a step in the input adapting process. + + The input adapting process is a sequence of steps that are applied to the + PyTorch model inputs to transform them into the inputs format expected by the + exported ONNX model. Each step takes the PyTorch model inputs as arguments and + returns the transformed inputs. + + This serves as a base formalized construct for the transformation done to model + input signature by any individual component in the exporter. + """ + + def apply( + self, + model_args: Sequence[Any], + model_kwargs: Mapping[str, Any], + model: torch.nn.Module | Callable | torch_export.ExportedProgram | None = None, + ) -> tuple[Sequence[Any], Mapping[str, Any]]: ... + + +class InputAdapter: + """A class that adapts the PyTorch model inputs to exported ONNX model inputs format.""" + + def __init__(self, steps: list[InputAdaptStep] | None = None): + self._steps = steps or [] + + def append_step(self, step: InputAdaptStep) -> None: + """Appends a step to the input adapt steps. + + Args: + step: The step to append. + """ + self._steps.append(step) + + def apply( + self, + *model_args, + model: torch.nn.Module | Callable | torch_export.ExportedProgram | None = None, + **model_kwargs, + ) -> Sequence[int | float | bool | str | torch.Tensor | torch.dtype | None]: + """Converts the PyTorch model inputs to exported ONNX model inputs format. + + Args: + model_args: The PyTorch model inputs. + model: The PyTorch model. + model_kwargs: The PyTorch model keyword inputs. + Returns: + A sequence of tensors converted from PyTorch model inputs. + """ + args: Sequence[Any] = model_args + kwargs: Mapping[str, Any] = model_kwargs + for step in self._steps: + args, kwargs = step.apply(args, kwargs, model=model) + assert not kwargs + return args + + +@runtime_checkable +class OutputAdaptStep(Protocol): + """A protocol that defines a step in the output adapting process. + + The output adapting process is a sequence of steps that are applied to the + PyTorch model outputs to transform them into the outputs format produced by the + exported ONNX model. 
Each step takes the PyTorch model outputs as arguments and + returns the transformed outputs. + + This serves as a base formalized construct for the transformation done to model + output signature by any individual component in the exporter. + """ + + def apply( + self, + model_outputs: Any, + model: torch.nn.Module | Callable | torch_export.ExportedProgram | None = None, + ) -> Any: ... + + +class OutputAdapter: + """A class that adapts the PyTorch model outputs to exported ONNX model outputs format.""" + + def __init__(self, steps: list[OutputAdaptStep] | None = None): + self._steps = steps or [] + + def append_step(self, step: OutputAdaptStep) -> None: + """Appends a step to the output format steps. + + Args: + step: The step to append. + """ + self._steps.append(step) + + def apply( + self, + model_outputs: Any, + model: torch.nn.Module | Callable | torch_export.ExportedProgram | None = None, + ) -> Sequence[torch.Tensor | int | float | bool | str]: + """Converts the PyTorch model outputs to exported ONNX model outputs format. + + Args: + model_outputs: The PyTorch model outputs. + model: The PyTorch model. + + Returns: + PyTorch model outputs in exported ONNX model outputs format. + """ + for step in self._steps: + model_outputs = step.apply(model_outputs, model=model) + return model_outputs + + +# TODO: make_fx lose stack info https://github.com/pytorch/pytorch/issues/90276 + + +def _replace_tuple_with_list(spec: pytree.TreeSpec) -> pytree.TreeSpec: + _type = list if spec.type == tuple else spec.type + return pytree.TreeSpec( + _type, spec.context, list(map(_replace_tuple_with_list, spec.children_specs)) + ) + + +def _open_top_level_list_if_single_element(spec: pytree.TreeSpec) -> pytree.TreeSpec: + if spec.type == list and spec.num_children == 1: + return spec.children_specs[0] + return spec + + +def _assert_identical_pytree_spec( + spec1: pytree.TreeSpec, spec2: pytree.TreeSpec, error_message: str +) -> None: + """Assert the two `TreeSpec` objects are identical. + + Args: + spec1: The first `TreeSpec` object. + spec2: The second `TreeSpec` object. + error_message: The error message to raise if the two `TreeSpec` objects are not + identical. + + Raises: + ValueError: If the two `TreeSpec` objects are not identical. + """ + # TODO(bowbao): Turn this check into diagnostic. Consider warning instead of error. + pass_if_any_checks: Sequence[Callable[[], bool]] = [ + lambda: spec1 == spec2, + # FIXME: Bug in `dynamo.export`. Sometimes outputs returned in 'list' instead of 'tuple'. + lambda: _replace_tuple_with_list(spec1) == _replace_tuple_with_list(spec2), + # FIXME: Bug in `dynamo.export`. Sometimes single function return is wrapped in list. + lambda: _open_top_level_list_if_single_element(spec1) == spec2, + lambda: spec1 == _open_top_level_list_if_single_element(spec2), + ] + + if not any(check() for check in pass_if_any_checks): + raise ValueError(f"{error_message}\nExpect {spec1}.\nActual {spec2}.") + + +class BindInputStep(InputAdaptStep): + """Bind the input arguments to the model signature.""" + + def __init__(self, model_signature: inspect.Signature): + self._model_signature = model_signature + + def apply( + self, + model_args: Sequence[Any], + model_kwargs: Mapping[str, Any], + model: torch.nn.Module | Callable | torch_export.ExportedProgram | None = None, + ) -> tuple[Sequence[Any], Mapping[str, Any]]: + """Bind the input arguments to the model signature. + + We hope the input kwargs will be mapped to bound.args after binding. + If not, we will raise an error. 
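+ For example (illustrative): for a signature ``(x, y=None)`` called with
+ ``model_args=(x,)`` and ``model_kwargs={"y": y}``, both values are bound into
+ ``bound.arguments``, so this step returns empty args and ``{"x": x, "y": y}`` as kwargs.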
+ + Args: + model_args: The model args. + model_kwargs: The model kwargs. + model: The PyTorch model. + + Returns: + A tuple of the model args and kwargs. args is always empty. + + Raises: + ValueError: If there are keyword-only arguments left after binding args and + kwargs to model signature. + """ + bound = self._model_signature.bind(*model_args, **model_kwargs) + bound.apply_defaults() + + # keyword-only arguments are not handled. + # bound.kwargs only contains keyword-only arguments after calling + # bind & apply_defaults, so we raise if it's not empty. + if bound.kwargs: + raise ValueError("Keyword-only arguments are not supported.") + return (), bound.arguments + + +class MergeKwargsIntoArgsInputStep(InputAdaptStep): + """Merge the input kwargs into the input args.""" + + def apply( + self, + model_args: Sequence[Any], + model_kwargs: Mapping[str, Any], + model: torch.nn.Module | Callable | torch_export.ExportedProgram | None = None, + ) -> tuple[Sequence[Any], Mapping[str, Any]]: + """Merge the input kwargs into the input args. + + Args: + model_args: The model args. + model_kwargs: The model kwargs. + model: The PyTorch model. + + Returns: + A tuple of the model args and kwargs. kwargs is always empty. + """ + return tuple(model_args) + tuple(model_kwargs.values()), {} + + +class LiftParametersAndBuffersIntoArgsInputStep(InputAdaptStep): + """Append parameters and buffers to model's positional argument list.""" + + def __init__(self, inputs: tuple[torch.Tensor, ...]) -> None: + self.inputs = inputs + + def apply( + self, + model_args: Sequence[Any], + model_kwargs: Mapping[str, Any], + model: torch.nn.Module | Callable | torch_export.ExportedProgram | None = None, + ) -> tuple[Sequence[Any], Mapping[str, Any]]: + """Append model's parameters and buffers into its input. + + Args: + model_args: The model args. + model_kwargs: The model kwargs. + model: The PyTorch model. + + Returns: + A tuple of the model args + appended inputs and kwargs. + """ + return (*model_args, *self.inputs), model_kwargs + + +class ConvertComplexToRealRepresentationInputStep(InputAdaptStep): + """Convert complex dtype tensors to real representation tensors. + + ONNX does not support complex dtype tensors. Thus, we convert complex dtype tensors + to real representation tensors (i.e., float dtype tensors with an extra dimension + representing the real and imaginary parts of the complex number). + + """ + + def apply( + self, + model_args: Sequence[Any], + model_kwargs: Mapping[str, Any], + model: torch.nn.Module | Callable | torch_export.ExportedProgram | None = None, + ) -> tuple[Sequence[Any], Mapping[str, Any]]: + """Convert complex tensors to float tensors. + + Args: + model_args: The model args. + model_kwargs: The model kwargs. + model: The PyTorch model. + + Returns: + A tuple of the model args and kwargs. + """ + return ( + tuple( + torch.view_as_real(arg.resolve_conj()) + if isinstance(arg, torch.Tensor) and arg.is_complex() + else arg + for arg in model_args + ), + model_kwargs, + ) + + +class RemoveNoneInputStep(InputAdaptStep): + """Remove `None` from arguments. + + This adapt step assumes ``model_kwargs`` is empty. It also assumes ``model_args`` + is flattened, i.e. it does not check `None` inside nested collections. + """ + + def apply( + self, + model_args: Sequence[Any], + model_kwargs: Mapping[str, Any], + model: torch.nn.Module | Callable | torch_export.ExportedProgram | None = None, + ) -> tuple[Sequence[Any], Mapping[str, Any]]: + """Remove `None` from arguments. 
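+ For example (illustrative), ``model_args=(x, None, y)`` becomes ``(x, y)``.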
+ + Args: + model_args: The model args. + model_kwargs: The model kwargs. + model: The PyTorch model. + + Returns: + A tuple of the model args and kwargs. + + Raises: + ValueError: If `model_kwargs` is not empty. + """ + assert not model_kwargs + return tuple(arg for arg in model_args if arg is not None), {} + + +class RemoveNonTensorInputStep(InputAdaptStep): + """Remove the non-tensor input arguments. + + Dynamo does not support non-tensor input arguments (https://github.com/pytorch/pytorch/issues/99534). + + Specifically, it does put the input into graph with an empty node, but consumed by no ones. + The concrete value is embedded into the graph as a constant arg of a target node. Meta + suggests in this case that one should rewrite the model code to make it tensor if the + input value is supposed to change at runtime. We might need to further investigate + the feasibility of that suggestion. + + For example, + + def func(x, b=1.0): + y = x + b + z = y.relu() + return (y, z) + + x = torch.randn(1, 1, 2, dtype=torch.float32) + gm_fun, _ = dynamo.export(func, x, b=8.0, aten_graph=True, tracing_mode="real") + + # class GraphModule(torch.nn.Module): + # def forward(self, x, b): + # arg0: f32[1, 1, 2], arg1, = fx_pytree.tree_flatten_spec(([x, b], {}), self._in_spec) + # # File: path/to/pytorch/test_constant_input.py:5, code: y = x + b + # add_tensor: f32[1, 1, 2] = torch.ops.aten.add.Tensor(arg0, 8.0); arg0 = None + + # # File: path/to/pytorch/test_constant_input.py:6, code: z = y.relu() + # relu_default: f32[1, 1, 2] = torch.ops.aten.relu.default(add_tensor) + # return pytree.tree_unflatten([add_tensor, relu_default], self._out_spec) + + Empty torch.fx.Node input leading to a mismatched number of input with PyTorch, as + it's ignored in ONNX graph. Thus, we delete the useless input here. + + """ + + def apply( + self, + model_args: Sequence[Any], + model_kwargs: Mapping[str, Any], + model: torch.nn.Module | Callable | torch_export.ExportedProgram | None = None, + ) -> tuple[Sequence[Any], Mapping[str, Any]]: + """Remove Constant from arguments. + + Args: + model_args: The model args. + model_kwargs: The model kwargs. + model: The PyTorch model. + + Returns: + A tuple of the model args and kwargs. + + Raises: + ValueError: If `model_kwargs` is not empty. + """ + assert not model_kwargs + return ( + tuple( + arg + for arg in model_args + if not isinstance(arg, (int, float, bool, str)) + ), + {}, + ) + + +class FlattenInputWithTreeSpecValidationInputStep(InputAdaptStep): + """Flatten nested collection types and return a flat list of elements. + + ONNX can't represent collection types (e.g., dictionary, tuple of tuple of tensor, + etc). + + This class stores the `SpecTree` output produced when `adapt` was called the first + time. It then validates the `SpecTree` output produced from later `adapt` calls. + """ + + _spec: pytree.TreeSpec | None = None + + def apply( + self, + model_args: Sequence[Any], + model_kwargs: Mapping[str, Any], + model: torch.nn.Module | Callable | torch_export.ExportedProgram | None = None, + ) -> tuple[Sequence[Any], Mapping[str, Any]]: + """Flatten the model args and kwargs and validate the `SpecTree` output. + + Args: + model_args: The model args. + model_kwargs: The model kwargs. + model: The PyTorch model. + + Returns: + A tuple of the flattened model args and kwargs. The kwargs is empty, because + they are flattened and merged into the args. 
+ + Raises: + ValueError: If the `SpecTree` output produced from the current `model_outputs` + is not identical to the `SpecTree` output produced from the first + `model_outputs` that was passed to this method. + """ + flattened_args, spec = pytree.tree_flatten((model_args, model_kwargs)) + if self._spec is None: + self._spec = spec + else: + _assert_identical_pytree_spec( + self._spec, + spec, + error_message="Model inputs incompatible with the format that was exported. ", + ) + return flattened_args, {} + + +class FlattenOutputStep(OutputAdaptStep): + """Flatten nested collection types and return a flat list of elements. + + ONNX can't represent collection types (e.g., dictionary, tuple of tuple of tensor, + etc). + + NOTE: Ideally we would want to use ``FlattenOutputWithTreeSpecValidationOutputStep``, such + that `SpecTree` can be validate for new model outputs. However, this is not possible + currently because we never have access to real PyTorch model outputs during export. + Only traced outputs may be available, but they are not an accurate reflection of the + original PyTorch model outputs format as they are typically in their own unique format, + depending on the tracing strategy. + """ + + def apply( + self, + model_outputs: Any, + model: torch.nn.Module | Callable | torch_export.ExportedProgram | None = None, + ) -> Sequence[Any]: + """Flatten the model outputs. + + Args: + model_outputs: The model outputs to flatten. + model: The PyTorch model. + + Returns: + A tuple of the flattened model outputs. + """ + return pytree.tree_leaves(model_outputs) + + +class ConvertComplexToRealRepresentationOutputStep(OutputAdaptStep): + """Convert complex dtype tensors to real representation tensors. + + ONNX does not support complex dtype tensors. Thus, we convert complex dtype tensors + to real representation tensors (i.e., float dtype tensors with an extra dimension + representing the real and imaginary parts of the complex number). + + """ + + def apply( + self, + model_outputs: Any, + model: torch.nn.Module | Callable | torch_export.ExportedProgram | None = None, + ) -> Any: + """Convert float tensors to complex tensors. + + Args: + model_output: The model output. + model: The PyTorch model. + + Returns: + A tuple of the model output. + """ + return [ + torch.view_as_real(output.resolve_conj()) + if isinstance(output, torch.Tensor) and torch.is_complex(output) + else output + for output in model_outputs + ] + + +class FlattenOutputWithTreeSpecValidationOutputStep(OutputAdaptStep): + """Same as ``FlattenOutputStep``, with additional `TreeSpec` validation. + + This class stores the `SpecTree` output produced when `adapt` was called the first + time. It then validates the `SpecTree` output produced from later `adapt` calls. + """ + + _spec: pytree.TreeSpec | None = None + + def apply( + self, + model_outputs: Any, + model: torch.nn.Module | Callable | torch_export.ExportedProgram | None = None, + ) -> Sequence[Any]: + """Flatten the model outputs and validate the `SpecTree` output. + + Args: + model_outputs: The model outputs to flatten. + model: The PyTorch model. + + Returns: + flattened_outputs: The flattened model outputs. + + Raises: + ValueError: If the `SpecTree` output produced from the current `model_outputs` + is not identical to the `SpecTree` output produced from the first + `model_outputs` that was passed to this method. 
+ """ + flattened_outputs, spec = pytree.tree_flatten(model_outputs) + if self._spec is None: + self._spec = spec + else: + _assert_identical_pytree_spec( + self._spec, + spec, + error_message="Model outputs incompatible with the format that was exported. ", + ) + return flattened_outputs + + +class PrependParamsBuffersConstantAotAutogradInputStep(InputAdaptStep): + """Prepend model parameters, buffers and constants to the user input. + + :func:`torch.export.export` lifts model parameters, buffers and constants as model input, thus, they + must be added to the user input before the model is executed. + + Args: + model: The PyTorch model with embedded parameters and buffers. + """ + + def apply( + self, + model_args: Sequence[Any], + model_kwargs: Mapping[str, Any], + model: torch.nn.Module | Callable | torch_export.ExportedProgram | None = None, + ) -> tuple[Sequence[Any], Mapping[str, Any]]: + """Convert complex tensors to float tensors. + + Args: + model_args: The model args. + model_kwargs: The model kwargs. + model: The PyTorch model. + + Returns: + A tuple of the model args and kwargs. + """ + ordered_params = tuple( + model.state_dict[name] # type: ignore[union-attr,index] + for name in model.graph_signature.parameters # type: ignore[union-attr] + ) + non_persistent_buffers = set(model.graph_signature.non_persistent_buffers) # type: ignore[union-attr] + ordered_buffers = [] + for name in model.graph_signature.buffers: # type: ignore[union-attr] + if name in non_persistent_buffers: + ordered_buffers.append(model.constants[name]) # type: ignore[union-attr] + else: + ordered_buffers.append(model.state_dict[name]) # type: ignore[union-attr,index] + ordered_constant_tensors = tuple( + model.constants[fqn] # type: ignore[union-attr,index] + for fqn in model.graph_signature.lifted_tensor_constants # type: ignore[union-attr] + ) + + # NOTE: calling convention is first params, then buffers, then args as user supplied them. + # See: torch/_functorch/aot_autograd.py#L1034 + updated_args = ( + *ordered_params, + *ordered_buffers, + *ordered_constant_tensors, + *model_args, + ) + if model_kwargs: + return MergeKwargsIntoArgsInputStep().apply( + updated_args, model_kwargs, model=model + ) + return updated_args, {} + + +class PrependParamsAndBuffersAotAutogradOutputStep(OutputAdaptStep): + """Prepend model's mutated buffers to the user output. + + :func:`torch.export.export` lifts model's mutated buffers as outputs, thus, they + must be added to the user output after the model is executed. + + Args: + model: The PyTorch model with mutated buffers. + """ + + def apply( + self, + model_outputs: Any, + model: torch.nn.Module | Callable | torch_export.ExportedProgram | None = None, + ) -> Sequence[Any]: + """Flatten the model outputs and validate the `SpecTree` output. + + Args: + model_outputs: The model outputs to flatten. + model: The PyTorch model. + + Returns: + flattened_outputs: The flattened model outputs. + """ + + assert isinstance( + model, torch_export.ExportedProgram + ), "'model' must be torch_export.ExportedProgram" + ordered_buffers = tuple( + model.state_dict[name] + if name in model.state_dict + else model.constants[name] + for name in model.graph_signature.buffers_to_mutate.values() + ) + + # NOTE: calling convention is first mutated buffers, then outputs args as model returned them. 
+ updated_outputs = (*ordered_buffers, *model_outputs) + return updated_outputs diff --git a/.venv/Lib/site-packages/torch/onnx/_internal/jit_utils.py b/.venv/Lib/site-packages/torch/onnx/_internal/jit_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..00bfa899e19e43e1a5ac11eeeb493a7db4d7144c --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/_internal/jit_utils.py @@ -0,0 +1,373 @@ +# mypy: allow-untyped-defs +"""Utilities for manipulating the torch.Graph object and the torchscript.""" + +# TODO(justinchuby): Move more of the symbolic helper functions here and expose +# them to the user. + +from __future__ import annotations + +import dataclasses +import re +import typing +from typing import Any, Iterable, Sequence + +import torch +from torch import _C +from torch.onnx._globals import GLOBALS +from torch.onnx._internal import registration + + +_ATTR_PATTERN = re.compile("^(.+)_(([ifstgz])|(ty))$") +_SKIP_NODE_ATTRIBUTES = {"inplace", "aten"} + + +@dataclasses.dataclass +class GraphContext: + """Extra context for symbolic functions with all methods from torch.Graph. + + NOTE: This class is not meant for external consumption. Please do not depend on + it outside of torch.onnx as the interface may evolve. + + Attributes: + graph: The _C.Graph being constructed. + block: The current _C.Block being constructed. + opset: The opset version. + original_node: Current node that is being converted from. + params_dict: Mapping from graph initializer name to IValue. + env: Mapping from Torch domain graph Value to ONNX domain graph Value. + values_in_env: Set of all values in env, for constant-time lookups. + new_nodes: List that tracks all new nodes that are added (used to make + sure metadata is propagated to all new nodes). + """ + + graph: _C.Graph + block: _C.Block + opset: int + original_node: _C.Node + params_dict: dict[str, _C.IValue] + env: dict[_C.Value, _C.Value] + values_in_env: set[_C.Value] + new_nodes: list[_C.Node] = dataclasses.field(default_factory=list) + + # Relay methods from _C.Graph for compatibility with symbolic functions that expect + # a _C.Graph + def __getattr__(self, name: str) -> Any: + return getattr(self.graph, name) + + def op( + self, + opname: str, + *raw_args: torch.Tensor | _C.Value, + outputs: int = 1, + **kwargs, + ): + """Creates an ONNX operator "opname", taking "raw_args" as inputs and "kwargs" as attributes. + + The set of operators and the inputs/attributes they take + is documented at https://github.com/onnx/onnx/blob/master/docs/Operators.md + + Args: + opname: The ONNX operator name, e.g., `Abs` or `Add`, or an operator qualified + with a namespace, e.g., `aten::add`. + raw_args: The inputs to the operator; usually provided + as arguments to the `symbolic` definition. + outputs: The number of outputs this operator returns. + By default an operator is assumed to return a single output. + If `outputs` is greater than one, this functions returns a tuple + of output `Value`, representing each output of the ONNX operator + in order. + kwargs: The attributes of the ONNX operator, whose keys are named + according to the following convention: `alpha_f` indicates + the `alpha` attribute with type `f`. The valid type specifiers are + `f` (float), `i` (int), `s` (string) or `t` (Tensor). An attribute + specified with type float accepts either a single float, or a + list of floats (e.g., you would say `dims_i` for a `dims` attribute + that takes a list of integers). 
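+ For example (illustrative), on a graph context ``g``, ``g.op("Add", x, y)`` creates
+ an ``Add`` node with two inputs, and ``g.op("Constant", value_t=torch.tensor(1.0))``
+ creates a ``Constant`` node whose ``value`` attribute is supplied as a tensor.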
+ + Returns: + The value representing the single output of this operator (see the `outputs` + keyword argument for multi-return nodes). + """ + # FIXME(justinchuby): Add the return type back once we know how to handle mypy + return _add_op(self, opname, *raw_args, outputs=outputs, **kwargs) + + def aten_op(self, operator: str, *args, overload_name: str = "", **kwargs): + """Generates an ONNX ATen op node. + + This function is for backward compatibility with the old symbolic functions. + """ + return self.op( + "aten::ATen", + *args, + operator_s=operator, + overload_name_s=overload_name, + **kwargs, + ) + + # NOTE: For backward compatibility with the old symbolic functions. + # We are probably going to remove this only after the fx exporter is established. + at = aten_op + + def onnxscript_op( + self, + onnx_fn, + *raw_args: torch.Tensor | _C.Value, + outputs: int = 1, + **kwargs, + ): + """Creates an ONNX operator from onnx-script function, taking "raw_args" as inputs and "kwargs" as attributes. + + onnx-script repository: https://github.com/microsoft/onnx-script + + Args: + onnx_fn: ONNXFunction from onnx-script; An example can be found at + https://github.com/microsoft/onnx-script#example + raw_args: The inputs to the operator; usually provided + as arguments to the `symbolic` definition. + outputs: The number of outputs this operator returns. + By default an operator is assumed to return a single output. + If `outputs` is greater than one, this functions returns a tuple + of output `Value`, representing each output of the ONNX operator + in order. + kwargs: The attributes of the ONNX operator, whose keys are named + according to the following convention: `alpha_f` indicates + the `alpha` attribute with type `f`. The valid type specifiers are + `f` (float), `i` (int), `s` (string) or `t` (Tensor). An attribute + specified with type float accepts either a single float, or a + list of floats (e.g., you would say `dims_i` for a `dims` attribute + that takes a list of integers). + + Returns: + The value representing the single output of this operator (see the `outputs` + keyword argument for multi-return nodes). + """ + # NOTE(titaiwang): This is using class attributes, and it needs to be updated + # if onnx-script makes any change on these. + symbolic_name = f"{onnx_fn.opset.domain}::{onnx_fn.name}" + opset_version = onnx_fn.opset.version + + registration.custom_onnx_symbolic(symbolic_name, opset_version)(onnx_fn) + + return _add_op(self, symbolic_name, *raw_args, outputs=outputs, **kwargs) + + +def add_op_with_blocks( + graph_context: GraphContext, + opname: str, + *inputs: _C.Value, + outputs: int = 1, + n_blocks: int = 1, + **attributes, +) -> tuple[Any, tuple[GraphContext, ...], _C.Node]: + """Creates an ONNX operator "opname", taking inputs and attributes. + + Args: + graph_context: The context for the current graph. + opname: The ONNX operator name, e.g., `Abs` or `Add`, or an operator qualified + with a namespace, e.g., `aten::add`. + inputs: The inputs to the operator. + outputs: The number of outputs this operator returns. + By default an operator is assumed to return a single output. + If `outputs` is greater than one, this functions returns a tuple + of output `Value`, representing each output of the ONNX operator + in order. + n_blocks: The number of sub-blocks to create in the node. + attributes: The attributes of the ONNX operator. 
+ + Returns: + A tuple of (output_values, new_contexts, node) where: + output_values: One or more output value of this operator + (see the `outputs` keyword argument for multi-return nodes). + new_contexts: A tuple of new graph contexts for each sub-block. + node: The node representing the operator. + """ + + output_values = graph_context.op(opname, *inputs, outputs=outputs, **attributes) + if isinstance(output_values, Sequence): + node = output_values[0].node() + else: + node = output_values.node() + + new_contexts = [] + for _ in range(n_blocks): + new_block = node.addBlock() + # Create shallow copy of the graph context and update the block + new_context = dataclasses.replace(graph_context, block=new_block) + new_contexts.append(new_context) + + return output_values, tuple(new_contexts), node + + +def _add_op( + graph_context: GraphContext, + opname: str, + *args: torch.Tensor | _C.Value, + outputs: int = 1, + **kwargs, +): + """Creates an ONNX operator "opname", taking "args" as inputs and attributes "kwargs". + + The set of operators and the inputs/attributes they take + is documented at https://github.com/onnx/onnx/blob/master/docs/Operators.md + + This function is monkey-patched onto Graph. + + Args: + graph_context: The Torch Graph or Block. + opname: The ONNX operator name, e.g., `Abs` or `Add`, or an operator qualified + with a namespace, e.g., `aten::add`. + args: The inputs to the operator; usually provided + as arguments to the `symbolic` definition. + outputs: The number of outputs this operator returns. + By default an operator is assumed to return a single output. + If `outputs` is greater than one, this functions returns a tuple + of output `Value`, representing each output of the ONNX operator + in order. + kwargs: The attributes of the ONNX operator, whose keys are named + according to the following convention: `alpha_f` indicates + the `alpha` attribute with type `f`. The valid type specifiers are + `f` (float), `i` (int), `s` (string) or `t` (Tensor). An attribute + specified with type float accepts either a single float, or a + list of floats (e.g., you would say `dims_i` for a `dims` attribute + that takes a list of integers). + + Returns: + (Union[_C.Value, Tuple[_C.Value, ...]]) + The value representing the single output of this operator (see the `outputs` + keyword argument for multi-return nodes). 
+ """ + inputs = [_const_if_tensor(graph_context, arg) for arg in args] + # Filter out None attributes, this can be convenient client side because + # now they can pass through None attributes, and have them not show up + attributes = {k: v for k, v in kwargs.items() if v is not None} + + if "::" not in opname: + opname = "onnx::" + opname + + node = _create_node( + graph_context.block, + opname, + inputs, + attributes, + params_dict=graph_context.params_dict, + opset_version=graph_context.opset, + n_outputs=outputs, + shape_inference=GLOBALS.onnx_shape_inference, + ) + graph_context.new_nodes.append(node) + + if outputs == 1: + return node.output() + return tuple(node.outputs()) + + +def _const_if_tensor(graph_context: GraphContext, arg): + if arg is None: + return arg + if isinstance(arg, _C.Value): + return arg + + return _add_op(graph_context, "onnx::Constant", value_z=arg) + + +def _create_node( + graph_or_block: _C.Graph | _C.Block, + domain_op: str, + inputs: Sequence, + attributes: dict, + params_dict: dict, + opset_version: int, + n_outputs: int, + shape_inference: bool = True, +) -> _C.Node: + """Creates an node 'domain_op', taking inputs and attributes.""" + if isinstance(graph_or_block, _C.Graph): + graph = graph_or_block + node = graph.create(domain_op, inputs, n_outputs) + node = graph.insertNode(node) + elif isinstance(graph_or_block, _C.Block): + block = graph_or_block + node = block.addNode(domain_op, inputs) + + # Block does not have create defined, so we need to add outputs manually + if n_outputs > 1: + for _ in range(1, n_outputs): + node.addOutput() + + node_outputs = tuple(node.outputs()) # type: ignore[possibly-undefined] + assert len(node_outputs) == n_outputs + + aten = domain_op.startswith("aten::") + + # Add all attributes + for key, value in sorted(attributes.items()): + if key in _SKIP_NODE_ATTRIBUTES: + continue + _add_attribute(node, key, value, aten=aten) + if shape_inference: + _C._jit_pass_onnx_node_shape_type_inference(node, params_dict, opset_version) + return node + + +def _is_onnx_list(value): + return isinstance(value, Iterable) and not isinstance( + value, (str, bytes, torch.Tensor) + ) + + +def _scalar(x: torch.Tensor): + """Convert a scalar tensor into a Python value.""" + assert x.numel() == 1 + return x[0] + + +def _add_attribute(node: _C.Node, key: str, value: Any, aten: bool): + r"""Initializes the right attribute based on type of value.""" + m = _ATTR_PATTERN.match(key) + if m is None: + raise ValueError( + f"Invalid attribute specifier '{key}' names " + "must be suffixed with type, e.g. 'dim_i' or 'dims_i'" + ) + name, kind = m.group(1), m.group(2) + if _is_onnx_list(value): + kind += "s" + + return getattr(node, f"{kind}_")(name, value) + + +# TODO: Expose this to user when migrating symbolic helper functions to here. +def _is_tensor(x: _C.Value) -> bool: + return x.type().isSubtypeOf(_C.TensorType.get()) + + +def get_device_from_value(value: _C.Value) -> torch.device | None: + if not _is_tensor(value): + return None + tensor_type = typing.cast(_C.TensorType, value.type()) + return tensor_type.device() + + +def parse_node_kind(kind: str) -> tuple[str, str]: + """Parse node kind into domain and Op name.""" + if "::" not in kind: + raise ValueError(f"Node kind: {kind} is invalid. '::' is not in node kind.") + domain, opname = kind.split("::", 1) + if "::" in opname: + raise ValueError(f"Node kind: {kind} is invalid. 
'::' should only apear once.") + return domain, opname + + +def is_aten(domain: str) -> bool: + """Check if the domain is official.""" + return domain == "aten" + + +def is_prim(domain: str) -> bool: + """Check if the domain is official.""" + return domain == "prim" + + +def is_onnx(domain: str) -> bool: + """Check if the domain is official.""" + return domain == "onnx" diff --git a/.venv/Lib/site-packages/torch/onnx/_internal/onnx_proto_utils.py b/.venv/Lib/site-packages/torch/onnx/_internal/onnx_proto_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..65b9e3e52fbf38416fa32ed23f8a563f09489ac0 --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/_internal/onnx_proto_utils.py @@ -0,0 +1,283 @@ +# mypy: allow-untyped-defs +"""Utilities for manipulating the onnx and onnx-script dependencies and ONNX proto.""" + +from __future__ import annotations + +import glob +import io +import os +import shutil +import zipfile +from typing import Any, Mapping + +import torch +import torch.jit._trace +import torch.serialization +from torch.onnx import _constants, _exporter_states, errors +from torch.onnx._internal import jit_utils, registration + + +def export_as_test_case( + model_bytes: bytes, inputs_data, outputs_data, name: str, dir: str +) -> str: + """Export an ONNX model as a self contained ONNX test case. + + The test case contains the model and the inputs/outputs data. The directory structure + is as follows: + + dir + \u251c\u2500\u2500 test_ + \u2502 \u251c\u2500\u2500 model.onnx + \u2502 \u2514\u2500\u2500 test_data_set_0 + \u2502 \u251c\u2500\u2500 input_0.pb + \u2502 \u251c\u2500\u2500 input_1.pb + \u2502 \u251c\u2500\u2500 output_0.pb + \u2502 \u2514\u2500\u2500 output_1.pb + + Args: + model_bytes: The ONNX model in bytes. + inputs_data: The inputs data, nested data structure of numpy.ndarray. + outputs_data: The outputs data, nested data structure of numpy.ndarray. + + Returns: + The path to the test case directory. + """ + try: + import onnx + except ImportError as exc: + raise ImportError( + "Export test case to ONNX format failed: Please install ONNX." + ) from exc + + test_case_dir = os.path.join(dir, "test_" + name) + os.makedirs(test_case_dir, exist_ok=True) + _export_file( + model_bytes, + os.path.join(test_case_dir, "model.onnx"), + _exporter_states.ExportTypes.PROTOBUF_FILE, + {}, + ) + data_set_dir = os.path.join(test_case_dir, "test_data_set_0") + if os.path.exists(data_set_dir): + shutil.rmtree(data_set_dir) + os.makedirs(data_set_dir) + + proto = onnx.load_model_from_string(model_bytes) # type: ignore[attr-defined] + + for i, (input_proto, input) in enumerate(zip(proto.graph.input, inputs_data)): + export_data(input, input_proto, os.path.join(data_set_dir, f"input_{i}.pb")) + for i, (output_proto, output) in enumerate(zip(proto.graph.output, outputs_data)): + export_data(output, output_proto, os.path.join(data_set_dir, f"output_{i}.pb")) + + return test_case_dir + + +def load_test_case(dir: str) -> tuple[bytes, Any, Any]: + """Load a self contained ONNX test case from a directory. + + The test case must contain the model and the inputs/outputs data. The directory structure + should be as follows: + + dir + \u251c\u2500\u2500 test_ + \u2502 \u251c\u2500\u2500 model.onnx + \u2502 \u2514\u2500\u2500 test_data_set_0 + \u2502 \u251c\u2500\u2500 input_0.pb + \u2502 \u251c\u2500\u2500 input_1.pb + \u2502 \u251c\u2500\u2500 output_0.pb + \u2502 \u2514\u2500\u2500 output_1.pb + + Args: + dir: The directory containing the test case. 
+ + Returns: + model_bytes: The ONNX model in bytes. + inputs: the inputs data, mapping from input name to numpy.ndarray. + outputs: the outputs data, mapping from output name to numpy.ndarray. + """ + try: + import onnx + from onnx import numpy_helper # type: ignore[attr-defined] + except ImportError as exc: + raise ImportError( + "Load test case from ONNX format failed: Please install ONNX." + ) from exc + + with open(os.path.join(dir, "model.onnx"), "rb") as f: + model_bytes = f.read() + + test_data_dir = os.path.join(dir, "test_data_set_0") + + inputs = {} + input_files = glob.glob(os.path.join(test_data_dir, "input_*.pb")) + for input_file in input_files: + tensor = onnx.load_tensor(input_file) # type: ignore[attr-defined] + inputs[tensor.name] = numpy_helper.to_array(tensor) + outputs = {} + output_files = glob.glob(os.path.join(test_data_dir, "output_*.pb")) + for output_file in output_files: + tensor = onnx.load_tensor(output_file) # type: ignore[attr-defined] + outputs[tensor.name] = numpy_helper.to_array(tensor) + + return model_bytes, inputs, outputs + + +def export_data(data, value_info_proto, f: str) -> None: + """Export data to ONNX protobuf format. + + Args: + data: The data to export, nested data structure of numpy.ndarray. + value_info_proto: The ValueInfoProto of the data. The type of the ValueInfoProto + determines how the data is stored. + f: The file to write the data to. + """ + try: + from onnx import numpy_helper # type: ignore[attr-defined] + except ImportError as exc: + raise ImportError( + "Export data to ONNX format failed: Please install ONNX." + ) from exc + + with open(f, "wb") as opened_file: + if value_info_proto.type.HasField("map_type"): + opened_file.write( + numpy_helper.from_dict(data, value_info_proto.name).SerializeToString() + ) + elif value_info_proto.type.HasField("sequence_type"): + opened_file.write( + numpy_helper.from_list(data, value_info_proto.name).SerializeToString() + ) + elif value_info_proto.type.HasField("optional_type"): + opened_file.write( + numpy_helper.from_optional( + data, value_info_proto.name + ).SerializeToString() + ) + else: + assert value_info_proto.type.HasField("tensor_type") + opened_file.write( + numpy_helper.from_array(data, value_info_proto.name).SerializeToString() + ) + + +def _export_file( + model_bytes: bytes, + f: io.BytesIO | str, + export_type: str, + export_map: Mapping[str, bytes], +) -> None: + """export/write model bytes into directory/protobuf/zip""" + if export_type == _exporter_states.ExportTypes.PROTOBUF_FILE: + assert len(export_map) == 0 + with torch.serialization._open_file_like(f, "wb") as opened_file: + opened_file.write(model_bytes) + elif export_type in { + _exporter_states.ExportTypes.ZIP_ARCHIVE, + _exporter_states.ExportTypes.COMPRESSED_ZIP_ARCHIVE, + }: + compression = ( + zipfile.ZIP_DEFLATED + if export_type == _exporter_states.ExportTypes.COMPRESSED_ZIP_ARCHIVE + else zipfile.ZIP_STORED + ) + with zipfile.ZipFile(f, "w", compression=compression) as z: + z.writestr(_constants.ONNX_ARCHIVE_MODEL_PROTO_NAME, model_bytes) + for k, v in export_map.items(): + z.writestr(k, v) + elif export_type == _exporter_states.ExportTypes.DIRECTORY: + if isinstance(f, io.BytesIO) or not os.path.isdir(f): # type: ignore[arg-type] + raise ValueError( + f"f should be directory when export_type is set to DIRECTORY, instead get type(f): {type(f)}" + ) + if not os.path.exists(f): # type: ignore[arg-type] + os.makedirs(f) # type: ignore[arg-type] + + model_proto_file = os.path.join(f, 
_constants.ONNX_ARCHIVE_MODEL_PROTO_NAME) # type: ignore[arg-type] + with torch.serialization._open_file_like(model_proto_file, "wb") as opened_file: + opened_file.write(model_bytes) + + for k, v in export_map.items(): + weight_proto_file = os.path.join(f, k) # type: ignore[arg-type] + with torch.serialization._open_file_like( + weight_proto_file, "wb" + ) as opened_file: + opened_file.write(v) + else: + raise ValueError("Unknown export type") + + +def _add_onnxscript_fn( + model_bytes: bytes, + custom_opsets: Mapping[str, int], +) -> bytes: + """Insert model-included custom onnx-script function into ModelProto""" + try: + import onnx + except ImportError as e: + raise errors.OnnxExporterError("Module onnx is not installed!") from e + + # For > 2GB model, onnx.load_fromstring would fail. However, because + # in _export_onnx, the tensors should be saved separately if the proto + # size > 2GB, and if it for some reason did not, the model would fail on + # serialization anyway in terms of the protobuf limitation. So we don't + # need to worry about > 2GB model getting here. + model_proto = onnx.load_model_from_string(model_bytes) # type: ignore[attr-defined] + + # Iterate graph nodes to insert only the included custom + # function_proto into model_proto + onnx_function_list = [] # type: ignore[var-annotated] + included_node_func: set[str] = set() + # onnx_function_list and included_node_func are expanded in-place + _find_onnxscript_op( + model_proto.graph, included_node_func, custom_opsets, onnx_function_list + ) + + if onnx_function_list: + model_proto.functions.extend(onnx_function_list) + model_bytes = model_proto.SerializeToString() + return model_bytes + + +def _find_onnxscript_op( + graph_proto, + included_node_func: set[str], + custom_opsets: Mapping[str, int], + onnx_function_list: list, +): + """Recursively iterate ModelProto to find ONNXFunction op as it may contain control flow Op.""" + for node in graph_proto.node: + node_kind = node.domain + "::" + node.op_type + # Recursive needed for control flow nodes: IF/Loop which has inner graph_proto + for attr in node.attribute: + if attr.g is not None: + _find_onnxscript_op( + attr.g, included_node_func, custom_opsets, onnx_function_list + ) + # Only custom Op with ONNX function and aten with symbolic_fn should be found in registry + onnx_function_group = registration.registry.get_function_group(node_kind) + # Ruled out corner cases: onnx/prim in registry + if ( + node.domain + and not jit_utils.is_aten(node.domain) + and not jit_utils.is_prim(node.domain) + and not jit_utils.is_onnx(node.domain) + and onnx_function_group is not None + and node_kind not in included_node_func + ): + specified_version = custom_opsets.get(node.domain, 1) + onnx_fn = onnx_function_group.get(specified_version) + if onnx_fn is not None: + if hasattr(onnx_fn, "to_function_proto"): + onnx_function_proto = onnx_fn.to_function_proto() # type: ignore[attr-defined] + onnx_function_list.append(onnx_function_proto) + included_node_func.add(node_kind) + continue + + raise errors.UnsupportedOperatorError( + node_kind, + specified_version, + onnx_function_group.get_min_supported() + if onnx_function_group + else None, + ) + return onnx_function_list, included_node_func diff --git a/.venv/Lib/site-packages/torch/onnx/_internal/onnxruntime.py b/.venv/Lib/site-packages/torch/onnx/_internal/onnxruntime.py new file mode 100644 index 0000000000000000000000000000000000000000..a22328d2685417b7d8d34ecd2dfd6196aca4a8d9 --- /dev/null +++ 
b/.venv/Lib/site-packages/torch/onnx/_internal/onnxruntime.py @@ -0,0 +1,1260 @@ +# mypy: allow-untyped-defs +import dataclasses +import importlib +import logging +import os +from typing import ( + Any, + Callable, + Dict, + Final, + List, + Mapping, + Optional, + Sequence, + Set, + Tuple, + TYPE_CHECKING, + Union, +) +from typing_extensions import TypeAlias + +import torch +import torch._C +import torch._ops +import torch._prims.executor +import torch.fx +from torch._subclasses.fake_tensor import FakeTensor +from torch.fx._compatibility import compatibility +from torch.fx.passes.fake_tensor_prop import FakeTensorProp +from torch.fx.passes.operator_support import OperatorSupport +from torch.fx.passes.tools_common import CALLABLE_NODE_OPS +from torch.utils import _pytree + + +if TYPE_CHECKING: + import onnx + import onnxruntime + from onnxruntime.capi import _pybind_state as ORTC + + import torch.onnx + import torch.onnx._internal + import torch.onnx._internal._exporter_legacy + import torch.onnx._internal.diagnostics + import torch.onnx._internal.fx.decomposition_table + import torch.onnx._internal.fx.passes # noqa: TCH004 + + +_SUPPORT_ONNXRT: Optional[bool] = None + +__all__ = [ + "is_onnxrt_backend_supported", + "torch_compile_backend", + "OrtExecutionProvider", + "OrtBackendOptions", + "OrtBackend", +] + + +def is_onnxrt_backend_supported() -> bool: + """Returns ``True`` if ONNX Runtime dependencies are installed and usable + to support TorchDynamo backend integration; ``False`` otherwise. + + Example:: + + # xdoctest: +REQUIRES(env:TORCH_DOCTEST_ONNX) + >>> import torch + >>> if torch.onnx.is_onnxrt_backend_supported(): + ... @torch.compile(backend="onnxrt") + ... def f(x): + ... return x * x + ... print(f(torch.randn(10))) + ... else: + ... print("pip install onnx onnxscript onnxruntime") + ... + """ + global _SUPPORT_ONNXRT + + if _SUPPORT_ONNXRT is None: + # `onnxruntime` might import a lot of other runtime packages, + # e.g. apex, deepspeed, transformers. + # So lazy-importing onnxruntime to avoid possible circular import. + try: + importlib.import_module("onnxruntime") + importlib.import_module("onnxruntime.capi._pybind_state") + + # This is not use directly in DORT but needed by underlying exporter, + # so we still need to check if it exists. + importlib.import_module("onnxscript") + + import torch.onnx # noqa: F401 + import torch.onnx._internal # noqa: F401 + import torch.onnx._internal._exporter_legacy # noqa: F401 + import torch.onnx._internal.diagnostics # noqa: F401 + from torch.onnx._internal.fx import ( # noqa: F401 + decomposition_table, + fx_onnx_interpreter, + passes, + type_utils, + ) + + _SUPPORT_ONNXRT = True + except ImportError: + _SUPPORT_ONNXRT = False + + return _SUPPORT_ONNXRT + + +_dumped_onnx_model: Dict[str, int] = {} + + +def _dump_onnx_model( + model_string: bytes, graph_module: Optional[torch.fx.GraphModule] = None +) -> str: + """Stores the onnx model into a file. + The name is "{ONNXRT_DUMP_PATH}{N}.onnx" + where *N* is the number of files already stored with + this prefix. + If graph_module is not None, the graph is stored as a string with + the same filename except the extension (.txt). 
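+ Dumping is skipped (an empty string is returned) unless the ``ONNXRT_DUMP_PATH``
+ environment variable is set to a non-empty prefix.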
+ """ + prefix = os.environ.get("ONNXRT_DUMP_PATH", None) + if not prefix: + return "" + n = _dumped_onnx_model.get(prefix, -1) + 1 + filename = f"{prefix}{n}.onnx" + with open(filename, "wb") as f: + f.write(model_string) + _dumped_onnx_model[prefix] = n + if graph_module is not None: + filename_txt = f"{prefix}{n}.txt" + with open(filename_txt, "w", encoding="utf-8") as f: + f.write(str(graph_module.graph)) + return filename + + +def _infer_default_eps() -> Sequence[str]: + # TODO: select a good default based on the capabilities of the host + # e.g. DML on Windows, etc. + return ["CPUExecutionProvider"] + + +def _nvtx_range_push(name: str): + """If PyTorch is installed with CUDA support, this starts NVTX range. + + Check torch.cuda.nvtx.range_push's document for more details. + """ + if torch.cuda.is_available(): + torch.cuda.nvtx.range_push(name) + + +def _nvtx_range_pop(): + """If PyTorch is installed with CUDA support, this terminates NVTX range. + + Check torch.cuda.nvtx.range_pop's document for more details. + """ + if torch.cuda.is_available(): + torch.cuda.nvtx.range_pop() + + +def _get_ort_device_type(device_type: str): + from onnxruntime.capi import _pybind_state as ORTC + + if device_type == "cuda": + return ORTC.OrtDevice.cuda() + if device_type == "cpu": + return ORTC.OrtDevice.cpu() + # ort pytorch device is mapped to NPU OrtDevice type + if device_type == "maia": + return ORTC.OrtDevice.npu() + raise ValueError("Unsupported device type: " + device_type) + + +logger = logging.getLogger(__name__) +# Uncomment the following lines to print out development info. +# logging.basicConfig(level=logging.WARNING) +# logger.setLevel(logging.WARNING) + + +class OrtOperatorSupport(OperatorSupport): + """Operator support for ONNXRuntime backend. + + It has two-level of support decision. One is via support_dict and the other one + is via extra_support_dict. The logic of using support_dict is implemented in + OrtOperatorSupport and extra_support_dict is used by OperatorSupport.is_node_supported. + """ + + def __init__(self, support_dict: Set[Any], extra_support_dict: Dict[str, Any]): + # Use extra_support_dict[op_name] = None to indicate + # we support op_name with all input types. Otherwise, + # see support_dict (type: SupportDict) in operator_support.py + # for specifying supported types. + super().__init__(extra_support_dict) + self._onnx_support_dict = support_dict + + def is_node_supported( + self, submodules: Mapping[str, torch.nn.Module], node: torch.fx.Node + ) -> bool: + # OperatorSupport.is_node_supported returns True for non-callable nodes. + # Since ORT can't execute them, we return False here to override the base + # behavior. + if node.op not in CALLABLE_NODE_OPS: + return False + # This is the and the only place to decide if aten op is supported. + if node.op == "call_function" and node.target in self._onnx_support_dict: + logger.info( + "support_dict supports node.target: %s (type: %s)", + node.target, + type(node.target), + ) + return True + # If node.target is not in support_dict, we still want to check if torch.jit.script + # can convert it to ONNX equivalence. Let's use base mechanism to do this. + # See extra_support_dict for supported ops. 
+        if super().is_node_supported(submodules, node):
+            logger.info(
+                "extra_support_dict supports node.target: %s (type: %s)",
+                node.target,
+                type(node.target),
+            )
+            return True
+        logger.warning(
+            "support_dict and extra_support_dict don't support node.target: %s (type: %s)",
+            node.target,
+            type(node.target),
+        )
+        return False
+
+
+def _move_placeholder_to_front(graph_module: torch.fx.GraphModule) -> None:
+    """
+    In torch.fx.Graph, placeholder is a special assignment node. If it's not
+    executed at the beginning, it could overwrite values computed by upstream
+    nodes.
+    """
+
+    graph = graph_module.graph
+    placeholders = []
+    first_not_placeholder = None
+    for node in graph.nodes:
+        if node.op == "placeholder":
+            placeholders.append(node)
+        if first_not_placeholder is None and node.op != "placeholder":
+            first_not_placeholder = node
+    if first_not_placeholder is None:
+        return
+    for placeholder in placeholders:
+        first_not_placeholder.prepend(placeholder)
+
+
+def _infer_ep_from_device(*args) -> Tuple[str, ...]:
+    """Return the execution providers inferred from the devices (i.e., GPU or CPU) of the arguments."""
+    eps = []
+    for arg in args:
+        if hasattr(arg, "device"):
+            device = arg.device
+            if device.type == "cuda":
+                eps.append("CUDAExecutionProvider")
+            elif device.type == "cpu":
+                eps.append("CPUExecutionProvider")
+    return tuple(eps)
+
+
+def _extract_graph_module_inputs(graph_module: torch.fx.GraphModule) -> Tuple[Any, ...]:
+    placeholders = []
+    for node in graph_module.graph.nodes:
+        if node.op == "placeholder":
+            if hasattr(node, "meta") and "val" in node.meta:
+                assert isinstance(node.meta["val"], torch.Tensor)
+            placeholders.append(node)
+    return tuple(placeholders)
+
+
+def _extract_graph_module_outputs(graph_module: torch.fx.GraphModule) -> Any:
+    """Collect "val" fields from outputs metadata in this torch.fx.GraphModule."""
+    for node in graph_module.graph.nodes:
+        if node.op == "output":
+            # The output node is unique. Let's retrieve the output values from
+            # this node's input list and then just return.
+            return node.args[0]
+    raise ValueError("No output node found in this torch.fx.GraphModule.")
+
+
+def _infer_ep_from_graph_module(graph_module: torch.fx.GraphModule) -> Tuple[str, ...]:
+    """Return the execution providers inferred from the devices of this torch.fx.GraphModule's outputs."""
+    flattened_output_args, _ = _pytree.tree_flatten(
+        _extract_graph_module_outputs(graph_module)
+    )
+    # Output arguments with example value (type: torch.Tensor) in the `graph_module`.
+    selected_output_args = [
+        output_arg.meta["val"]
+        for output_arg in flattened_output_args
+        # output_arg must carry a tensor for its device information;
+        # otherwise, skip it.
+        if (hasattr(output_arg, "meta") and "val" in output_arg.meta)
+    ]
+    return _infer_ep_from_device(*selected_output_args)
+
+
+def _sort_eps(eps: Tuple[str, ...]) -> Tuple[str, ...]:
+    """Sort execution providers in eps based on pre-set priority."""
+
+    def get_execution_provider_priority(ep: str) -> int:
+        if ep == "CPUExecutionProvider":
+            # Lowest priority.
+            return 2
+        if ep == "CUDAExecutionProvider":
+            # Higher priority than CPU but lower than
+            # other specialized EPs.
+            return 1
+        # Highest priority.
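+        # Key semantics (restating the comments above, for illustration):
+        # a smaller value means higher pre-set priority, e.g.
+        # "TensorrtExecutionProvider" -> 0, "CUDAExecutionProvider" -> 1,
+        # "CPUExecutionProvider" -> 2.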
+        return 0
+
+    unique_eps = set(eps)
+    return tuple(sorted(unique_eps, key=get_execution_provider_priority, reverse=True))
+
+
+def _get_onnx_devices(
+    values: Tuple[
+        Union[
+            torch.Tensor, torch.SymInt, int, torch.SymFloat, float, torch.SymBool, bool
+        ],
+        ...,
+    ],
+) -> Tuple["ORTC.OrtDevice", ...]:
+    from onnxruntime.capi import _pybind_state as ORTC
+
+    def _device_id_or_zero(device_id: int) -> int:
+        return device_id or 0
+
+    def _map_tensor_or_sym_to_device(
+        value: Union[
+            torch.Tensor, torch.SymInt, int, torch.SymFloat, float, torch.SymBool, bool
+        ],
+    ) -> "ORTC.OrtDevice":
+        if isinstance(value, torch.Tensor):
+            return ORTC.OrtDevice(
+                _get_ort_device_type(value.device.type),
+                ORTC.OrtDevice.default_memory(),
+                _device_id_or_zero(value.device.index),
+            )
+        elif isinstance(
+            value, (torch.SymInt, int, torch.SymFloat, float, torch.SymBool, bool)
+        ):
+            return ORTC.OrtDevice(
+                _get_ort_device_type("cpu"), ORTC.OrtDevice.default_memory(), 0
+            )
+        else:
+            raise ValueError("Unsupported value type: " + str(type(value)))
+
+    if len(values) > 0:
+        ort_devices = tuple(_map_tensor_or_sym_to_device(value) for value in values)
+        return ort_devices
+    else:
+        return (_map_tensor_or_sym_to_device(1),)
+
+
+def _get_ortvalues_from_torch_tensors(
+    tensors: Tuple[torch.Tensor, ...], devices: Tuple["ORTC.OrtDevice", ...]
+) -> "ORTC.OrtValueVector":
+    from onnxruntime.capi import _pybind_state as ORTC
+
+    from torch.onnx._internal.fx.type_utils import _TORCH_DTYPE_TO_NUMPY_DTYPE
+
+    ortvalues = ORTC.OrtValueVector()
+    ortvalues.reserve(len(tensors))
+    dtypes = []
+    shapes = []
+    data_ptrs = []
+
+    for tensor in tensors:
+        dtypes.append(_TORCH_DTYPE_TO_NUMPY_DTYPE[tensor.dtype])
+        shapes.append(tensor.size())
+        data_ptrs.append(tensor.data_ptr())
+    ortvalues.push_back_batch(tensors, data_ptrs, dtypes, shapes, devices)
+    return ortvalues
+
+
+def _to_real_tensor(tensor: FakeTensor) -> torch.Tensor:
+    if tensor.is_sparse:
+        raise ValueError("sparse tensor is not yet supported.")
+    out = torch.empty(tensor.size(), dtype=tensor.dtype, device=tensor.device)
+    return out
+
+
+def _adjust_scalar_from_fx_to_onnx(
+    dynamo_value: Union[
+        torch.Tensor,
+        int,
+        float,
+        bool,
+    ],
+    value_info: "onnx.ValueInfoProto",  # type: ignore[name-defined]
+) -> torch.Tensor:
+    """Helper function to wrap PyTorch variables as torch.Tensor"""
+    if (
+        isinstance(dynamo_value, torch.Tensor)
+        and len(value_info.type.tensor_type.shape.dim) == 0
+        and dynamo_value.shape == (1,)
+    ):
+        # ONNX expects a scalar with an empty shape.
+        # In contrast, PyTorch usually allows implicit
+        # conversion between shape=() and shape=(1,).
+        #
+        # Below, PyTorch's shape (1,) is reshaped to ().
+        return torch.squeeze(dynamo_value)
+    elif isinstance(dynamo_value, int):
+        return torch.tensor(dynamo_value, dtype=torch.int64)
+    elif isinstance(dynamo_value, float):
+        return torch.tensor(dynamo_value, dtype=torch.float32)
+    elif isinstance(dynamo_value, bool):
+        return torch.tensor(dynamo_value, dtype=torch.bool)
+    else:
+        assert isinstance(dynamo_value, torch.Tensor)
+        return dynamo_value.contiguous()
+
+
+def _adjust_scalar_from_onnx_to_fx(
+    tensor: torch.Tensor,
+    prim_value: Union[
+        torch.Tensor,
+        torch.SymInt,
+        int,
+        torch.SymFloat,
+        float,
+        torch.SymBool,
+        bool,
+    ],
+) -> Union[
+    torch.Tensor,
+    int,
+    float,
+    bool,
+]:
+    """Helper function to wrap ORT-produced torch.Tensor as PyTorch variables"""
+    assert isinstance(tensor, torch.Tensor), "ORT's output must be a tensor."
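+    # Illustration: if the original FX value was a Python int, ORT hands back
+    # a 0-d tensor (e.g., tensor(3)); tensor.item() below restores the int 3
+    # so that downstream Dynamo code sees the scalar type it expects.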
+    if isinstance(
+        prim_value,
+        (torch.SymInt, int, torch.SymFloat, float, torch.SymBool, bool),
+    ):
+        # Convert the tensor back to a scalar to match Dynamo's expectation.
+        return tensor.item()
+    return tensor
+
+
+def _run_onnx_session_with_ortvaluevector(
+    sess: "onnxruntime.InferenceSession",
+    input_names: Tuple[str, ...],
+    inputs: Tuple[torch.Tensor, ...],
+    input_devices: Tuple["ORTC.OrtDevice", ...],
+    output_names: Tuple[str, ...],
+    outputs: Tuple[torch.Tensor, ...],
+    output_devices: Tuple["ORTC.OrtDevice", ...],
+    preallocate_output: bool,
+    input_value_infos: Tuple["onnx.ValueInfoProto", ...],  # type: ignore[name-defined]
+    normalized_prim_outputs: Tuple[
+        Union[
+            torch.Tensor, torch.SymInt, int, torch.SymFloat, float, torch.SymBool, bool
+        ],
+        ...,
+    ],
+) -> Tuple[Union[torch.Tensor, int, float, bool], ...]:
+    import onnxruntime
+    from onnxruntime.capi import _pybind_state as ORTC
+
+    _nvtx_range_push("contiguous")
+    inputs = tuple(
+        _adjust_scalar_from_fx_to_onnx(arg, value_info)
+        for arg, value_info in zip(inputs, input_value_infos)
+    )
+    _nvtx_range_pop()
+
+    _nvtx_range_push("push_back_batch")
+    ort_inputs = _get_ortvalues_from_torch_tensors(inputs, input_devices)
+
+    # Pre-allocate output PyTorch tensors and use their buffers, affined to the
+    # torch device, for the output OrtValues. Because the output OrtValues are
+    # then not allocated or owned by ORT, there is no need to convert them back
+    # to torch tensors and transfer ownership.
+    if preallocate_output:
+        pth_outputs = tuple(
+            _to_real_tensor(t) if isinstance(t, FakeTensor) else t for t in outputs
+        )
+        ort_outputs = _get_ortvalues_from_torch_tensors(pth_outputs, output_devices)
+    else:
+        ort_outputs = ORTC.OrtValueVector()
+    _nvtx_range_pop()
+
+    _nvtx_range_push("run_with_ortvaluevector")
+    run_options = onnxruntime.RunOptions()
+    run_options.add_run_config_entry("disable_synchronize_execution_providers", "1")
+    sess.run_with_ortvaluevector(
+        run_options, input_names, ort_inputs, output_names, ort_outputs, output_devices
+    )
+    _nvtx_range_pop()
+
+    # Post-processing step:
+    #  wrap ORT's outputs in the schema represented by
+    #  `prim_output` (obtained by running the original
+    #  torch.fx.GraphModule).
+    if preallocate_output:
+        # Profile the ORT-to-PyTorch type cast below.
+        _nvtx_range_push("after run_with_ortvaluevector")
+        # Outputs are stored in the pre-allocated torch.Tensors' memory,
+        # so this case doesn't need to convert OrtValues to torch.Tensors.
+        pth_outputs = tuple(
+            _adjust_scalar_from_onnx_to_fx(onnx_output, prim_output)  # type: ignore[misc]
+            for onnx_output, prim_output in zip(pth_outputs, normalized_prim_outputs)
+        )
+        _nvtx_range_pop()
+        return pth_outputs
+    else:
+        # Profile the two ORT-to-PyTorch type casts below.
+        _nvtx_range_push("after run_with_ortvaluevector")
+        # Map OrtValues to torch.Tensors.
+        pth_outputs = onnxruntime.training.ortmodule._utils._ortvalues_to_torch_tensor(
+            ort_outputs
+        )
+        # Convert some torch.Tensors back to int, float, or bool.
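+        # Illustration: ort_outputs is an ORTC.OrtValueVector; the helper above
+        # wraps each OrtValue as a torch.Tensor, and the loop below demotes 0-d
+        # tensors back to Python scalars so that the returned schema matches
+        # normalized_prim_outputs.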
+ pth_outputs = tuple( + _adjust_scalar_from_onnx_to_fx(onnx_output, prim_output) # type: ignore[misc] + for onnx_output, prim_output in zip(pth_outputs, normalized_prim_outputs) + ) + _nvtx_range_pop() + return pth_outputs + + +def _run_onnx_session_with_fetch( + sess: "onnxruntime.InferenceSession", + input_names: Tuple[str, ...], + inputs: Tuple[torch.Tensor, ...], + input_devices: Tuple["ORTC.OrtDevice", ...], + output_names: Tuple[str, ...], + outputs: Tuple[torch.Tensor, ...], + output_devices: Tuple["ORTC.OrtDevice", ...], + preallocate_output: bool, + input_value_infos: Tuple["onnx.ValueInfoProto", ...], # type: ignore[name-defined] + normalized_prim_outputs: Tuple[ + Union[ + torch.Tensor, torch.SymInt, int, torch.SymFloat, float, torch.SymBool, bool + ], + ..., + ], +) -> Tuple[Union[torch.Tensor, int, float, bool], ...]: + import onnxruntime + + inputs = tuple( + _adjust_scalar_from_fx_to_onnx(arg, value_info) + for arg, value_info in zip(inputs, input_value_infos) + ) + feed = { + name: onnxruntime.OrtValue.ortvalue_from_numpy(tensor.cpu().numpy()) + for name, tensor in zip(input_names, inputs) + } + ort_outputs = sess.run(output_names, feed) + pth_outputs = tuple( + _adjust_scalar_from_onnx_to_fx( + torch.from_numpy(value), + prim_output, + ) + for value, prim_output in zip(ort_outputs, normalized_prim_outputs) + ) + return pth_outputs + + +class OrtExecutionInfoPerSession: + """Information required to execute torch.fx.GraphModule using onnxruntime.InferenceSession""" + + def __init__( + self, + session: "onnxruntime.InferenceSession", + input_names: Tuple[str, ...], + input_value_infos: Tuple["onnx.ValueInfoProto", ...], # type: ignore[name-defined] + output_names: Tuple[str, ...], + output_value_infos: Tuple["onnx.ValueInfoProto", ...], # type: ignore[name-defined] + input_devices: Tuple["ORTC.OrtDevice", ...], + output_devices: Tuple["ORTC.OrtDevice", ...], + example_outputs: Union[Tuple[torch.Tensor, ...], torch.Tensor], + ): + # Carrier of ONNX model and its executor. + self.session: onnxruntime.InferenceSession = session + # For the ONNX model stored in self.session, self.input_names[i] is the + # name of the i-th positional input. + self.input_names: Tuple[str, ...] = input_names + # self.input_name[i]'s type information is stored in self.input_value_infos[i]. + self.input_value_infos: Tuple[onnx.ValueInfoProto, ...] = input_value_infos # type: ignore[name-defined] + # Similar to self.input_names, but for outputs. + self.output_names: Tuple[str, ...] = output_names + # Similar to self.input_value_infos but for outputs. + self.output_value_infos: Tuple[onnx.ValueInfoProto, ...] = output_value_infos # type: ignore[name-defined] + # For the ONNX model stored in self.session, self.input_devices[i] is the + # i-th positional input's device. + self.input_devices: Tuple[ORTC.OrtDevice, ...] = input_devices + # Similar to self.input_devices, but for outputs. + self.output_devices: Tuple[ORTC.OrtDevice, ...] = output_devices + # This is the outputs of executing the original torch.fx.GraphModule with example inputs + # (i.e., args passed into OrtBackend._ort_acclerated_call). + self.example_outputs: Union[Tuple[torch.Tensor, ...], torch.Tensor] = ( + example_outputs + ) + + def is_supported(self, *args): + from torch.onnx._internal.fx.type_utils import ( + _TORCH_DTYPE_TO_ONNX_TENSOR_ELEMENT_TYPE, + from_python_type_to_onnx_tensor_element_type, + ) + + # Compare the args and the input schema in ONNX model and + # return the first match. 
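+        # Illustration (hypothetical model): if the session was exported for
+        # (float32[2, 3], int scalar), then is_supported(torch.ones(2, 3), 5)
+        # is True, while is_supported(torch.ones(4, 3), 5) is False (static
+        # dim mismatch) and is_supported(torch.ones(2, 3).double(), 5) is
+        # False (dtype mismatch).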
+ if len(args) != len(self.input_value_infos): + return False + for arg, value_info in zip(args, self.input_value_infos): + if not isinstance(arg, (torch.Tensor, float, int)): + return False + + # Check Python scalars such as int, float, and bool. + if isinstance(arg, (int, float, bool)): + # Map, e.g., float to onnx.TensorProto.FLOAT. + onnx_dtype = from_python_type_to_onnx_tensor_element_type(type(arg)) + if onnx_dtype != value_info.type.tensor_type.elem_type: + return False + if len(value_info.type.tensor_type.shape.dim) != 0: + return False + continue + + # Check tensor. + onnx_dtype = _TORCH_DTYPE_TO_ONNX_TENSOR_ELEMENT_TYPE[arg.dtype] + if onnx_dtype != value_info.type.tensor_type.elem_type: + return False + for dim, onnx_dim in zip(arg.shape, value_info.type.tensor_type.shape.dim): + if isinstance(dim, int) and ( + onnx_dim.dim_value == dim or onnx_dim.dim_param + ): + continue + elif isinstance(dim, torch.SymInt) and onnx_dim.dim_param: + continue + else: + return False + return True + + +@dataclasses.dataclass +class OrtExecutionInfoForAllGraphModules: + def __init__(self) -> None: + # All sessions (and their related information) created by exporting the same GraphModule + # with different inputs. + self.execution_info_per_graph_module: Dict[ + torch.fx.GraphModule, List[OrtExecutionInfoPerSession] + ] = {} + + def search_reusable_session_execution_info( + self, graph_module: torch.fx.GraphModule, *args + ): + if graph_module not in self.execution_info_per_graph_module: + return None + # All execution information for ONNX models exported from the same `graph_module` + # with different inputs. + candidates = self.execution_info_per_graph_module[graph_module] + + for candidate in candidates: + if candidate.is_supported(*args): + # Returns the first session that accepts this input schema. + return candidate + # No reusable session found. + return None + + def cache_session_execution_info( + self, graph_module: torch.fx.GraphModule, info: OrtExecutionInfoPerSession + ): + if graph_module not in self.execution_info_per_graph_module: + self.execution_info_per_graph_module[graph_module] = [info] + else: + self.execution_info_per_graph_module[graph_module].append(info) + + +OrtExecutionProvider: TypeAlias = Union[str, Tuple[str, Mapping[str, Any]]] +"""Either the name of an ONNX Runtime execution provider as a string or +a 2-tuple of the name and a dictionary of execution provider options. + +Examples:: + + >>> "CPUExecutionProvider" + + >>> ("CUDAExecutionProvider", {"device_id": 3}) + +""" + + +@dataclasses.dataclass(frozen=True) +@compatibility(is_backward_compatible=False) +class OrtBackendOptions: + """Options for constructing an ``OrtBackend``, the ONNX Runtime + backend (``"onnxrt"``) for ``torch.compile``. + + Example:: + + >>> @torch.compile( + ... backend="onnxrt", + ... options=torch.onnx._OrtBackendOptions(...), + ... ) + ... def ort_function(x): + ... return x ** x + """ + + preferred_execution_providers: Optional[Sequence[OrtExecutionProvider]] = None + """An optional sequence of execution providers to be prioritized ahead of any + execution providers that may be inferred (see ``infer_execution_providers``). + """ + + infer_execution_providers: bool = True + """Whether to infer an execution provider from ``torch.device`` bound to inputs or found in the graph.""" + + default_execution_providers: Optional[Sequence[OrtExecutionProvider]] = None + """The default fallback execution providers. 
If not specified, one will
+    be selected based on the host environment (most likely ``"CPUExecutionProvider"``).
+    """
+
+    # preallocate_output allows pre-allocating the output torch Tensor buffers and feeding them to InferenceSession
+    # in order to avoid InferenceSession allocating the output buffers internally.
+    # If an output OrtValue returned from InferenceSession is allocated internally,
+    # it needs to be converted to a torch Tensor for return, and that torch Tensor should hold the ownership.
+    # When a custom torch device is used with a custom aten allocator, the conversion from OrtValue to torch Tensor
+    # must be supported; this is currently done through dlpack, which might not support a custom torch device.
+    # This can be avoided by pre-allocating the output buffers with the custom aten allocator
+    # and handing them to InferenceSession, which then holds no ownership over them.
+    # TODO(wschin): Make it an inference-session-level flag.
+    # See https://github.com/pytorch/pytorch/issues/106869.
+    preallocate_output: bool = False
+    """If ``True``, allocate memory for ONNX Runtime's outputs on the PyTorch side."""
+
+    use_aot_autograd: bool = True
+    """Whether to wrap the ``OrtBackend`` with TorchDynamo's aot_autograd backend
+    to support training (i.e., backward graphs are also sent to ``OrtBackend``).
+
+    Symbolic execution is used to capture the forward and backward passes as a single graph.
+    Then, a selected graph partition algorithm (``min_cut_rematerialization_partition``) is used
+    to split the entire graph into a forward sub-graph and a backward sub-graph. Finally, both
+    sub-graphs are compiled by ``OrtBackend``.
+    """
+
+    export_options: Optional["torch.onnx.ExportOptions"] = None
+    """Options for the TorchDynamo-based ONNX exporter used by the ``OrtBackend``."""
+
+    ort_session_options: Optional["onnxruntime.SessionOptions"] = None
+    """Options for the ``onnxruntime.InferenceSession`` used by the ``OrtBackend``."""
+
+    pre_ort_model_transforms: Optional[  # type: ignore[name-defined]
+        Sequence[Callable[["onnx.ModelProto"], None]]
+    ] = None
+    """A list of graph transforms to be applied to the ONNX model before it
+    is fed to ONNXRuntime's InferenceSession."""
+
+
+@compatibility(is_backward_compatible=False)
+class OrtBackend:
+    """A backend that compiles (sub-)graphs in a torch.fx.GraphModule into onnxruntime.InferenceSession calls.
+
+    The compiler entry point is OrtBackend.compile, which
+    1. partitions the original graph into supported sub-graphs (type: torch.fx.GraphModule) and unsupported
+       sub-graphs;
+    2. for each supported sub-graph, replaces its _wrapped_call function with _ort_accelerated_call;
+    3. inside _ort_accelerated_call, creates an onnxruntime.InferenceSession and calls it to execute the sub-graph.
+    """
+
+    def __init__(self, options: Optional[OrtBackendOptions] = None):
+        from onnxruntime.capi import _pybind_state as ORTC
+
+        import torch.onnx
+        import torch.onnx._internal._exporter_legacy
+        import torch.onnx._internal.fx.decomposition_table
+
+        self._options: Final = OrtBackendOptions() if options is None else options
+
+        # options.export_options contains information shared between the exporter and DORT.
+        # For example, they should use the same decomposition table when
+        #  1. capturing the FX graph in torch.compile (see how we create aot_ort in register_backend.py)
+        #  2. calling the exporter's API to convert `torch.fx.GraphModule` to an ONNX model
+        #     (see onnxfunction_dispatcher passed to FxOnnxInterpreter.run below).
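+        #
+        # Illustration (assumed setting): passing ExportOptions(dynamic_shapes=True)
+        # here makes the FX-to-ONNX exporter treat shapes symbolically, and it
+        # also disables output preallocation in _ort_acclerated_call below.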
+        #
+        # Convert the user-facing option to the internal option used by the ONNX
+        # exporter to access required information.
+        # Some useful fields:
+        # - The decomposition table used by the exporter for decomposing FX operators is
+        #   self._resolved_onnx_exporter_options.decomposition_table.
+        # - self._resolved_onnx_exporter_options.onnx_registry records what
+        #   aten/prim ops are supported by the exporter and their exporters (type: callable).
+        self._resolved_onnx_exporter_options = (
+            torch.onnx._internal._exporter_legacy.ResolvedExportOptions(
+                torch.onnx.ExportOptions()
+                if self._options.export_options is None
+                else self._options.export_options
+            )
+        )
+
+        # Given DORT's computation flow:
+        #  1. OrtOperatorSupport uses support_dict and extra_support_dict to select operators
+        #     and send them to DORT.
+        #  2. Then, DORT exports the selected sub-graphs into ONNX.
+        #  3. Finally, DORT calls ORT to do the computation.
+        # OrtOperatorSupport and create_onnx_friendly_decomposition_table(...)
+        # must use the same support_dict. If the support_dict here contains something not
+        # supported by the exporter, the exporter will fail in step 2 since the selected graphs may
+        # contain unsupported operators such as aten::_who_you_are.
+        # This restriction is enforced automatically since DORT and the exporter share the same
+        # self._resolved_onnx_exporter_options.
+        support_dict = torch.onnx._internal.fx.decomposition_table._create_onnx_supports_op_overload_table(
+            self._resolved_onnx_exporter_options.onnx_registry
+        )
+
+        extra_support_dict: Dict[str, Any] = {
+            "getattr": None,
+            # To send operator.getitem to ORT, add the corresponding string
+            # recognized by PyTorch's OperatorSupport class.
+            "_operator.getitem": None,
+            # To send operator.mul to ORT, add the corresponding string
+            # recognized by PyTorch's OperatorSupport class.
+            "_operator.mul": None,
+            "_operator.add": None,
+            "_operator.sub": None,
+        }
+
+        self._supported_ops = OrtOperatorSupport(support_dict, extra_support_dict)
+        # TODO(wschin): this is a naive cache implementation without a proper guard.
+        # See https://github.com/pytorch/pytorch/issues/106868.
+        self._partitioner_cache: Dict[torch.fx.GraphModule, torch.fx.GraphModule] = {}
+        # Conceptually, this field is a two-layer dictionary:
+        #   GraphModule 0
+        #     ONNX Model 0 (with ORT InferenceSession and related information. type: OrtExecutionInfoPerSession)
+        #     ONNX Model 1
+        #     ...
+        #   GraphModule 1
+        #     ONNX Model 2 (with ORT InferenceSession and related information. type: OrtExecutionInfoPerSession)
+        #     ONNX Model 3
+        #     ...
+        #   ...
+        # It caches all previous compilation results so that we can reuse them.
+        # ONNX Model 0 and 1 are exported from the same GraphModule 0 but with different inputs
+        # (e.g., tensors with different ranks). GraphModule 0 and GraphModule 1 are different
+        # graphs captured by Dynamo and sent to OrtBackend.compile.
+        self._all_ort_execution_info = OrtExecutionInfoForAllGraphModules()
+
+        self._assert_allclose_to_baseline = False
+
+        self.execution_count = 0
+
+        # Function which invokes ORT to do the real computation.
+        self.run = (
+            _run_onnx_session_with_ortvaluevector
+            if hasattr(ORTC.OrtValueVector, "push_back_batch")
+            else _run_onnx_session_with_fetch
+        )
+
+    def _select_eps(
+        self, graph_module: torch.fx.GraphModule, *args
+    ) -> Sequence[Tuple[str, Mapping[str, Any]]]:
+        inferred_eps: Tuple[str, ...] = ()
+        if self._options.infer_execution_providers:
+            if eps_from_args := _infer_ep_from_device(*args):
+                # If the user feeds a CUDA tensor as an input argument,
+                # we want to use the CUDA EP.
+                # Thus, `eps_from_args` (deduced from input arguments)
+                # has the highest priority.
+                inferred_eps = eps_from_args
+            elif eps_from_graph_module := _infer_ep_from_graph_module(graph_module):
+                # If there is no EP in the input arguments, we deduce the EP from
+                # graph_module's outputs. Those outputs may come from
+                # FakeTensorProp or Dynamo's built-in symbolic shape inference.
+                inferred_eps = eps_from_graph_module
+
+        selected_eps = []
+
+        for ep in (
+            *(self._options.preferred_execution_providers or []),
+            *_sort_eps(inferred_eps),
+            *(self._options.default_execution_providers or _infer_default_eps()),
+        ):
+            if isinstance(ep, str):
+                ep = (ep, {})
+            elif isinstance(ep, tuple) and ep[1] is None:
+                ep = (ep[0], {})
+            if ep is not None and ep not in selected_eps:
+                selected_eps.append(ep)
+
+        return selected_eps
+
+    def _ort_acclerated_call(self, graph_module: torch.fx.GraphModule, *args, **kwargs):
+        """This function replaces GraphModule._wrapped_call in the compiled model.
+
+        _wrapped_call is the underlying implementation of the forward method. Replacing
+        it means we delegate the computation to _ort_acclerated_call and therefore
+        to onnxruntime.InferenceSession.
+        """
+        import onnxruntime
+
+        from torch.onnx._internal.fx import fx_onnx_interpreter, passes
+
+        cached_execution_info_per_session = (
+            self._all_ort_execution_info.search_reusable_session_execution_info(
+                graph_module, *args
+            )
+        )
+        if cached_execution_info_per_session:
+            onnx_session = cached_execution_info_per_session.session
+            input_names = cached_execution_info_per_session.input_names
+            output_names = cached_execution_info_per_session.output_names
+            input_value_infos = cached_execution_info_per_session.input_value_infos
+            output_value_infos = cached_execution_info_per_session.output_value_infos
+            input_devices = cached_execution_info_per_session.input_devices
+            output_devices = cached_execution_info_per_session.output_devices
+            prim_outputs = cached_execution_info_per_session.example_outputs
+        else:
+            # It's the first time we see such a graph. Let's make a new session
+            # (type: onnxruntime.InferenceSession) for it.
+
+            graph_module = passes.MovePlaceholderToFront(
+                self._resolved_onnx_exporter_options.diagnostic_context,
+                graph_module,
+            ).run()
+            # Generate reference outputs. They are used to indicate output
+            # tensors' types and devices when calling ORT.
+            #
+            # WARNING: The downstream code should not change prim_outputs, and
+            # this backend should always produce outputs with a schema identical to prim_outputs'.
+
+            if self._resolved_onnx_exporter_options.dynamic_shapes:
+                # No pre-allocation when dynamic shape is enabled.
+                self.preallocate_output = False
+                extracted_outputs = _extract_graph_module_outputs(graph_module)
+
+                def maybe_map_to_meta_val(value):
+                    if hasattr(value, "meta") and "val" in value.meta:
+                        # Select outputs with "val" information. Without "val",
+                        # it's not possible to access output_arg.meta["val"].device.
+                        return value.meta["val"]
+                    else:
+                        return value
+
+                prim_outputs = _pytree.tree_map(
+                    maybe_map_to_meta_val, extracted_outputs
+                )
+            else:
+                try:
+                    prim_outputs = FakeTensorProp(graph_module).propagate(
+                        *args, **kwargs
+                    )
+                except Exception:
+                    logger.warning("FakeTensorProp failed for %s", graph_module)
+                    # When FakeTensorProp fails, it is not possible to preallocate output buffers
+                    # because the output shapes are not inferred.
+                    self.preallocate_output = False
+
+                    # Re-throw the FakeTensorProp failure because it is not yet handled.
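+                    # (FakeTensorProp runs the graph with FakeTensors to infer
+                    # output shapes and dtypes without doing real computation;
+                    # without that metadata, the output buffers cannot be sized.)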
+                    raise
+
+            # Create the object that iterates through the nodes in the graph one by one
+            # and calls the corresponding ONNX exporter for each node.
+            fx_interpreter = fx_onnx_interpreter.FxOnnxInterpreter(
+                diagnostic_context=self._resolved_onnx_exporter_options.diagnostic_context
+            )
+            # Cast FX variables if they would result in a schema mismatch when searching
+            # for the ONNX operator. E.g., add(double_tensor, int_tensor) is fine in PyTorch,
+            # but ONNX expects add(double_tensor, double_tensor).
+            graph_module = passes.InsertTypePromotion(
+                self._resolved_onnx_exporter_options.diagnostic_context, graph_module
+            ).run()
+            # Start the per-node exporting process. It's conceptually a for loop
+            # scanning through the nodes in the graph.
+            exported = fx_interpreter.run(
+                fx_graph_module=graph_module,
+                onnxfunction_dispatcher=self._resolved_onnx_exporter_options.onnxfunction_dispatcher,
+            )
+            # Convert the exported result to an ONNX ModelProto.
+            onnx_model = exported.to_model_proto(
+                opset_version=self._resolved_onnx_exporter_options.onnx_registry.opset_version,
+            )
+
+            try:
+                from onnxscript import optimizer  # type: ignore[import]
+                from onnxscript.rewriter import (  # type: ignore[import]
+                    onnxruntime as ort_rewriter,
+                )
+
+                onnx_model = optimizer.optimize(onnx_model)
+                onnx_model = ort_rewriter.rewrite(onnx_model)
+            except ImportError:
+                logger.warning(
+                    "ONNXScript optimizer is not available. Skipping optimization. "
+                    "Please `pip install onnxscript -U` to enable post-export optimization."
+                )
+
+            # Modify the ONNX model using pre-registered graph transforms.
+            # They are applied in place to avoid unnecessary copies of
+            # ONNX initializers.
+            if self._options.pre_ort_model_transforms:
+                for transform in self._options.pre_ort_model_transforms:
+                    transform(onnx_model)
+
+            onnx_model_bytes = onnx_model.SerializeToString()
+            if os.environ.get("ONNXRT_DUMP_PATH", None):
+                # If not empty, the environment variable ONNXRT_DUMP_PATH defines the path
+                # where generated onnx files should be stored.
+                # This module keeps a global variable tracking the stored models.
+                # If ONNXRT_DUMP_PATH="dumped/dumped_model_",
+                # the first file name will be 'dumped/dumped_model_0.onnx'.
+                # For every dumped model, a text file 'dumped/dumped_model_0.txt'
+                # is created as well, containing the string representation of the graph_module.
+                _dump_onnx_model(onnx_model_bytes, graph_module=graph_module)
+
+            # Initialize an ORT session to execute this ONNX model.
+            # Note that TorchDynamo assumes all inputs/outputs are on the
+            # same device, but it's subject to change (very likely with
+            # dynamic shape support), so we add execution providers
+            # based on the logic in _select_eps (explicitly preferred EPs,
+            # EPs inferred from inputs or graph, and the fallback default EP).
+            #
+            # TODO(wschin): enable external allocators.
+            # See https://github.com/pytorch/pytorch/issues/106867
+            onnx_session = onnxruntime.InferenceSession(
+                path_or_bytes=onnx_model_bytes,
+                sess_options=self._options.ort_session_options,
+                providers=self._select_eps(graph_module, *args),
+            )
+
+            # Cache the ORT session; it's reused for the same "graph_module".
+            # Extract the ONNX model's input and output names.
+            input_names = tuple(input.name for input in onnx_model.graph.input)
+            output_names = tuple(output.name for output in onnx_model.graph.output)
+            input_devices = _get_onnx_devices(args)
+            # Cache devices for inputs and outputs. They are used to invoke
+            # the ORT session.
Output devices indicate where (e.g., GPU or CPU)
+            # outputs should be stored.
+            if isinstance(prim_outputs, tuple):
+                output_devices = _get_onnx_devices(prim_outputs)
+            else:
+                output_devices = _get_onnx_devices((prim_outputs,))
+
+            input_value_infos = tuple(input for input in onnx_model.graph.input)
+            output_value_infos = tuple(output for output in onnx_model.graph.output)
+
+            execution_info_per_session = OrtExecutionInfoPerSession(
+                session=onnx_session,
+                input_names=input_names,
+                input_value_infos=input_value_infos,
+                output_names=output_names,
+                output_value_infos=output_value_infos,
+                input_devices=input_devices,
+                output_devices=output_devices,
+                example_outputs=prim_outputs,
+            )
+
+            self._all_ort_execution_info.cache_session_execution_info(
+                graph_module, execution_info_per_session
+            )
+
+        self.execution_count += 1
+
+        # ORT always returns a tuple of outputs. If the original output is a tensor,
+        # the first element of ORT's output must be extracted and returned; otherwise,
+        # a type mismatch may happen in downstream computation.
+        is_single_tensor_output = isinstance(prim_outputs, torch.Tensor)
+        normalized_prim_outputs = (
+            (prim_outputs,) if is_single_tensor_output else prim_outputs
+        )
+        assert isinstance(normalized_prim_outputs, tuple)
+        assert all(
+            isinstance(elem, (torch.Tensor, torch.SymInt, int))
+            for elem in normalized_prim_outputs
+        )
+
+        _nvtx_range_push("run_onnx_session_with_ortvaluevector")
+        onnx_outputs = self.run(
+            onnx_session,
+            input_names,
+            args,
+            input_devices,
+            output_names,
+            normalized_prim_outputs,
+            output_devices,
+            self._options.preallocate_output,
+            input_value_infos,
+            normalized_prim_outputs,
+        )
+        _nvtx_range_pop()
+
+        if self._assert_allclose_to_baseline:
+            # Compute the baseline.
+            baseline_outputs = torch._prims.executor.execute(
+                graph_module, *args, executor="aten"
+            )
+            normalized_baseline_outputs = (
+                (baseline_outputs,) if is_single_tensor_output else baseline_outputs
+            )
+            # Ensure every output tensor is close to the corresponding baseline.
+            for onnx_output, baseline_output in zip(
+                onnx_outputs, normalized_baseline_outputs
+            ):
+                torch.testing.assert_close(onnx_output, baseline_output)
+        return onnx_outputs[0] if is_single_tensor_output else onnx_outputs
+
+    def compile(self, graph_module: torch.fx.GraphModule, args) -> torch.fx.GraphModule:
+        # Deferred import since CapabilityBasedPartitioner is not decorated with
+        # @compatibility; importing it at the module level will make the test
+        # `pytest test/test_fx.py -k test_public_api_surface` fail,
+        # because this module is imported into torch.onnx.
+        from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
+
+        # FX-graph-based partitioning based on ONNX-supported ops.
+        # Given a graph module
+        #  GraphModule0
+        #   node_0
+        #   node_1
+        #   node_2
+        #   node_3
+        #   node_4
+        # if only node_2 is not supported by ONNX, this graph module will be partitioned into
+        #  GraphModule0
+        #   GraphModule1 (fused): node_0, node_1
+        #   node_2 (unsupported, kept at the top level)
+        #   GraphModule2 (fused): node_3, node_4
+        # by calling CapabilityBasedPartitioner.partition_and_fuse.
+        # Then, GraphModule1's and GraphModule2's forward method (GraphModule._wrapped_call)
+        # will be replaced by OrtBackend._ort_accelerated_call to delegate computation to ORT.
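+        # Illustration: the cache below is keyed by GraphModule identity, so a
+        # graph Dynamo hands us twice (the same object) skips re-partitioning,
+        # while a distinct but isomorphic graph is partitioned again.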
+        if graph_module in self._partitioner_cache:
+            partitioned_prim_graph_module = self._partitioner_cache[graph_module]
+        else:
+            prim_graph_module = graph_module
+            partitioner = CapabilityBasedPartitioner(
+                prim_graph_module,
+                self._supported_ops,
+                allows_single_node_partition=True,
+            )
+            partitioned_prim_graph_module = partitioner.partition_and_fuse()
+            self._partitioner_cache[graph_module] = partitioned_prim_graph_module
+
+            # Override each fused module's _wrapped_call with _ort_acclerated_call.
+            # This loop goes through all graph partitions (each of them is an ONNX-representable graph)
+            # and overrides their _wrapped_call function with _ort_acclerated_call.
+            # Inside _ort_acclerated_call, the partition's graph is exported into ONNX and executed by ORT.
+            for node in partitioned_prim_graph_module.graph.nodes:
+                # TODO(wschin): use a better way to identify fused submodules.
+                # See https://github.com/pytorch/pytorch/issues/106872.
+                if node.op == "call_module" and "fused_" in node.name:
+                    fused_module = getattr(partitioned_prim_graph_module, node.name)
+                    # self._ort_acclerated_call is responsible for exporting the graph to ONNX,
+                    # creating an ORT session, and running that ORT session.
+                    fused_module._wrapped_call = self._ort_acclerated_call
+
+        return partitioned_prim_graph_module
+
+    def __call__(
+        self, graph_module: torch.fx.GraphModule, args
+    ) -> torch.fx.GraphModule:
+        """If ``OrtBackendOptions.use_aot_autograd`` is ``True``, the `aot_autograd` compiler
+        will be invoked, wrapping this ``OrtBackend`` instance's ``compile`` method. Otherwise,
+        the ``compile`` method is invoked directly."""
+        if self._options.use_aot_autograd:
+            from functorch.compile import min_cut_rematerialization_partition
+            from torch._dynamo.backends.common import aot_autograd
+
+            return aot_autograd(
+                fw_compiler=self.compile,
+                partition_fn=min_cut_rematerialization_partition,
+                decompositions=self._resolved_onnx_exporter_options.decomposition_table,
+            )(graph_module, args)
+
+        return self.compile(graph_module, args)
+
+    __instance_cache_max_count: Final = 8
+    __instance_cache: Final[List["OrtBackend"]] = []
+
+    @staticmethod
+    def get_cached_instance_for_options(
+        options: Optional[Union[OrtBackendOptions, Mapping[str, Any]]] = None,
+    ) -> "OrtBackend":
+        """Returns a possibly cached instance of an ``OrtBackend``. If an existing
+        backend was created previously through this function with the same options,
+        it will be returned. Otherwise a new backend will be created, cached, and
+        returned.
+
+        Note: if ``options`` sets ``ort_session_options``, a new ``OrtBackend``
+        will always be returned, since ``onnxruntime.SessionOptions`` cannot
+        participate in caching."""
+
+        def reusable(a: OrtBackendOptions, b: OrtBackendOptions):
+            if (
+                a.preferred_execution_providers != b.preferred_execution_providers
+                or a.infer_execution_providers != b.infer_execution_providers
+                or a.default_execution_providers != b.default_execution_providers
+                or a.preallocate_output != b.preallocate_output
+                or a.use_aot_autograd != b.use_aot_autograd
+                or a.pre_ort_model_transforms != b.pre_ort_model_transforms
+            ):
+                return False
+
+            # onnxruntime.SessionOptions is a pybind11 object, cannot be pickled,
+            # and holds too much potential state to reasonably check manually;
+            # if ort_session_options is provided at all, the backend does not
+            # participate in caching.
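+            # Illustration: reusable(OrtBackendOptions(), OrtBackendOptions())
+            # is True (both export_options are None, so the identity check
+            # `a.export_options is b.export_options` below succeeds), whereas
+            # any options carrying an ort_session_options object never match.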
+ if a.ort_session_options is not None or b.ort_session_options is not None: + return False + + if a.export_options is b.export_options: + return True + + # Similarly, some objects in ExportOptions are too stateful to use for + # caching. We should revisit this. + if a.export_options is not None and b.export_options is not None: + return ( + a.export_options.dynamic_shapes == b.export_options.dynamic_shapes + and a.export_options.diagnostic_options + == b.export_options.diagnostic_options + and a.export_options.onnx_registry is b.export_options.onnx_registry + and a.export_options.fake_context is b.export_options.fake_context + ) + + # We can't account for how the two option sets may differ, so it's not safe to reuse. + return False + + if not isinstance(options, OrtBackendOptions): + options = OrtBackendOptions(**(options or {})) + + backend = next( + (b for b in OrtBackend.__instance_cache if reusable(b._options, options)), + None, + ) + + if backend is None: + assert ( + len(OrtBackend.__instance_cache) < OrtBackend.__instance_cache_max_count + ), ( + f"No more than {OrtBackend.__instance_cache_max_count} instances of " + f"{OrtBackend} allowed. Please instantiate `{OrtBackend}` explicitly " + "to pass to `torch.compile`. " + "See https://github.com/pytorch/pytorch/pull/107973#discussion_r1306144795 " + "for discussion." + ) + OrtBackend.__instance_cache.append(backend := OrtBackend(options)) + + return backend + + @staticmethod + def clear_cached_instances(): + OrtBackend.__instance_cache.clear() + + @staticmethod + def get_cached_instances(): + return tuple(OrtBackend.__instance_cache) + + +@compatibility(is_backward_compatible=False) +def torch_compile_backend( + graph_module: torch.fx.GraphModule, + args, + *, + options: Optional[Union[OrtBackendOptions, Mapping[str, Any]]] = None, +): + return OrtBackend.get_cached_instance_for_options(options)(graph_module, args) diff --git a/.venv/Lib/site-packages/torch/onnx/_internal/registration.py b/.venv/Lib/site-packages/torch/onnx/_internal/registration.py new file mode 100644 index 0000000000000000000000000000000000000000..1217a993aff07146c1ce4a9c70f7db46b94e7f78 --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/_internal/registration.py @@ -0,0 +1,341 @@ +# mypy: allow-untyped-defs +"""Module for handling symbolic function registration.""" + +import warnings +from typing import ( + Callable, + Collection, + Dict, + Generic, + Optional, + Sequence, + Set, + TypeVar, + Union, +) + +from torch.onnx import _constants, errors + + +OpsetVersion = int + + +def _dispatch_opset_version( + target: OpsetVersion, registered_opsets: Collection[OpsetVersion] +) -> Optional[OpsetVersion]: + """Finds the registered opset given a target opset version and the available opsets. + + Args: + target: The target opset version. + registered_opsets: The available opsets. + + Returns: + The registered opset version. + """ + if not registered_opsets: + return None + + descending_registered_versions = sorted(registered_opsets, reverse=True) + # Linear search for the opset version, which is fine since the number of opset + # versions is small. + + if target >= _constants.ONNX_BASE_OPSET: + # Always look down toward opset 1 when the target is >= ONNX_BASE_OPSET (opset 9). + # When a custom op is register at opset 1, we want to be able to discover it as a + # fallback for all opsets >= ONNX_BASE_OPSET. + for version in descending_registered_versions: + if version <= target: + return version + return None + + # target < opset 9. 
This is the legacy behavior to support opset 7 and opset 8. + # for caffe2 support. We search up toward opset 9. + for version in reversed(descending_registered_versions): + # Count back up until _constants.ONNX_BASE_OPSET + if target <= version <= _constants.ONNX_BASE_OPSET: + return version + + return None + + +_K = TypeVar("_K") +_V = TypeVar("_V") + + +class OverrideDict(Collection[_K], Generic[_K, _V]): + """A dictionary that merges built-in and custom symbolic functions. + + It supports overriding and un-overriding built-in symbolic functions with custom + ones. + """ + + def __init__(self) -> None: + self._base: Dict[_K, _V] = {} + self._overrides: Dict[_K, _V] = {} + self._merged: Dict[_K, _V] = {} + + def set_base(self, key: _K, value: _V) -> None: + self._base[key] = value + if key not in self._overrides: + self._merged[key] = value + + def in_base(self, key: _K) -> bool: + """Checks if a key is in the base dictionary.""" + return key in self._base + + def override(self, key: _K, value: _V) -> None: + """Overrides a base key-value with a new pair.""" + self._overrides[key] = value + self._merged[key] = value + + def remove_override(self, key: _K) -> None: + """Un-overrides a key-value pair.""" + self._overrides.pop(key, None) # type: ignore[arg-type] + self._merged.pop(key, None) # type: ignore[arg-type] + if key in self._base: + self._merged[key] = self._base[key] + + def overridden(self, key: _K) -> bool: + """Checks if a key-value pair is overridden.""" + return key in self._overrides + + def __getitem__(self, key: _K) -> _V: + return self._merged[key] + + def get(self, key: _K, default: Optional[_V] = None): + return self._merged.get(key, default) + + def __contains__(self, key: object) -> bool: + return key in self._merged + + def __iter__(self): + return iter(self._merged) + + def __len__(self) -> int: + return len(self._merged) + + def __repr__(self) -> str: + return f"OverrideDict(base={self._base}, overrides={self._overrides})" + + def __bool__(self) -> bool: + return bool(self._merged) + + +class _SymbolicFunctionGroup: + """Different versions of symbolic functions registered to the same name. + + O(number of registered versions of an op) search is performed to find the most + recent version of the op. + + The registration is delayed until op is used to improve startup time. + + Function overloads with different arguments are not allowed. + Custom op overrides are supported. + """ + + def __init__(self, name: str) -> None: + self._name = name + # A dictionary of functions, keyed by the opset version. + self._functions: OverrideDict[OpsetVersion, Callable] = OverrideDict() + + def __repr__(self) -> str: + return f"_SymbolicFunctionGroup({self._name}, registered={self._functions})" + + def __getitem__(self, key: OpsetVersion) -> Callable: + result = self.get(key) + if result is None: + raise KeyError(key) + return result + + # TODO(justinchuby): Add @functools.lru_cache(maxsize=None) if lookup time becomes + # a problem. + def get(self, opset: OpsetVersion) -> Optional[Callable]: + """Find the most recent version of the function.""" + version = _dispatch_opset_version(opset, self._functions) + if version is None: + return None + + return self._functions[version] + + def add(self, func: Callable, opset: OpsetVersion) -> None: + """Adds a symbolic function. + + Args: + func: The function to add. + opset: The opset version of the function to add. 
+ """ + if self._functions.in_base(opset): + warnings.warn( + f"Symbolic function '{self._name}' already registered for opset {opset}. " + f"Replacing the existing function with new function. This is unexpected. " + f"Please report it on {_constants.PYTORCH_GITHUB_ISSUES_URL}.", + errors.OnnxExporterWarning, + ) + self._functions.set_base(opset, func) + + def add_custom(self, func: Callable, opset: OpsetVersion) -> None: + """Adds a custom symbolic function. + + Args: + func: The symbolic function to register. + opset: The corresponding opset version. + """ + self._functions.override(opset, func) + + def remove_custom(self, opset: OpsetVersion) -> None: + """Removes a custom symbolic function. + + Args: + opset: The opset version of the custom function to remove. + """ + if not self._functions.overridden(opset): + warnings.warn( + f"No custom function registered for '{self._name}' opset {opset}" + ) + return + self._functions.remove_override(opset) + + def get_min_supported(self) -> OpsetVersion: + """Returns the lowest built-in opset version supported by the function.""" + return min(self._functions) + + +class SymbolicRegistry: + """Registry for symbolic functions. + + The registry maintains a mapping from qualified names to symbolic functions. + It is used to register new symbolic functions and to dispatch calls to + the appropriate function. + """ + + def __init__(self) -> None: + self._registry: Dict[str, _SymbolicFunctionGroup] = {} + + def register( + self, name: str, opset: OpsetVersion, func: Callable, custom: bool = False + ) -> None: + """Registers a symbolic function. + + Args: + name: The qualified name of the function to register. In the form of 'domain::op'. + E.g. 'aten::add'. + opset: The opset version of the function to register. + func: The symbolic function to register. + custom: Whether the function is a custom function that overrides existing ones. + + Raises: + ValueError: If the separator '::' is not in the name. + """ + if "::" not in name: + raise ValueError( + f"The name must be in the form of 'domain::op', not '{name}'" + ) + symbolic_functions = self._registry.setdefault( + name, _SymbolicFunctionGroup(name) + ) + if custom: + symbolic_functions.add_custom(func, opset) + else: + symbolic_functions.add(func, opset) + + def unregister(self, name: str, opset: OpsetVersion) -> None: + """Unregisters a symbolic function. + + Args: + name: The qualified name of the function to unregister. + opset: The opset version of the function to unregister. + """ + if name not in self._registry: + return + self._registry[name].remove_custom(opset) + + def get_function_group(self, name: str) -> Optional[_SymbolicFunctionGroup]: + """Returns the function group for the given name.""" + return self._registry.get(name) + + def is_registered_op(self, name: str, version: int) -> bool: + """Returns whether the given op is registered for the given opset version.""" + functions = self.get_function_group(name) + if functions is None: + return False + return functions.get(version) is not None + + def all_functions(self) -> Set[str]: + """Returns the set of all registered function names.""" + return set(self._registry) + + +def onnx_symbolic( + name: str, + opset: Union[OpsetVersion, Sequence[OpsetVersion]], + decorate: Optional[Sequence[Callable]] = None, + custom: bool = False, +) -> Callable: + """Registers a symbolic function. 
+ + Usage:: + + ``` + @onnx_symbolic( + "aten::symbolic_b", + opset=10, + decorate=[quantized_aten_handler(scale=1 / 128, zero_point=0)], + ) + @symbolic_helper.parse_args("v", "v", "b") + def symbolic_b(g: _C.Graph, x: _C.Value, y: _C.Value, arg1: bool) -> _C.Value: ... + ``` + + Args: + name: The qualified name of the function in the form of 'domain::op'. + E.g. 'aten::add'. + opset: The opset versions of the function to register at. + decorate: A sequence of decorators to apply to the function. + custom: Whether the function is a custom symbolic function. + + Raises: + ValueError: If the separator '::' is not in the name. + """ + + def wrapper(func: Callable) -> Callable: + decorated = func + if decorate is not None: + for decorate_func in decorate: + decorated = decorate_func(decorated) + + global registry + nonlocal opset + if isinstance(opset, OpsetVersion): + opset = (opset,) + for opset_version in opset: + registry.register(name, opset_version, decorated, custom=custom) + + # Return the original function because the decorators in "decorate" are only + # specific to the instance being registered. + return func + + return wrapper + + +def custom_onnx_symbolic( + name: str, + opset: Union[OpsetVersion, Sequence[OpsetVersion]], + decorate: Optional[Sequence[Callable]] = None, +) -> Callable: + """Registers a custom symbolic function. + + Args: + name: the qualified name of the function. + opset: the opset version of the function. + decorate: a sequence of decorators to apply to the function. + + Returns: + The decorator. + + Raises: + ValueError: If the separator '::' is not in the name. + """ + return onnx_symbolic(name, opset, decorate, custom=True) + + +# The registry for all symbolic functions. +registry = SymbolicRegistry() diff --git a/.venv/Lib/site-packages/torch/onnx/_onnx_supported_ops.py b/.venv/Lib/site-packages/torch/onnx/_onnx_supported_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..57d40f741a3dc734a1322137451277807d5dc5af --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/_onnx_supported_ops.py @@ -0,0 +1,98 @@ +# mypy: allow-untyped-defs +import inspect +from typing import Dict, List, Union + +from torch import _C +from torch.onnx import _constants +from torch.onnx._internal import registration + + +class _TorchSchema: + def __init__(self, schema: Union[_C.FunctionSchema, str]) -> None: + if isinstance(schema, _C.FunctionSchema): + self.name: str = schema.name + self.overload_name: str = schema.overload_name + self.arguments: List[str] = [arg.name for arg in schema.arguments] + self.optional_arguments: List[str] = [] + self.returns: List[str] = [ret.name for ret in schema.returns] + self.opsets: List[int] = [] + else: + self.name = schema + self.overload_name = "" + self.arguments = [] + self.optional_arguments = [] + self.returns = [] + self.opsets = [] + + def __str__(self) -> str: + s = ( + f"{self.name}.{self.overload_name}(" + + ", ".join(self.arguments) + + ") -> (" + + ", ".join(self.returns) + + ")" + + " in opsets " + + ", ".join(str(opset) for opset in self.opsets) + ) + return s + + def __hash__(self): + # TODO(thiagocrepaldi): handle overload_name? + return hash(self.name) + + def __eq__(self, other) -> bool: + if not isinstance(other, _TorchSchema): + return False + # TODO(thiagocrepaldi): handle overload_name? 
+ return self.name == other.name + + def is_aten(self) -> bool: + return self.name.startswith("aten::") + + def is_backward(self) -> bool: + return "backward" in self.name + + +def _symbolic_argument_count(func): + params = [] + signature = inspect.signature(func) + optional_params = [] + for name, parameter in signature.parameters.items(): + if name in {"_outputs", "g"}: + continue + if parameter.default is parameter.empty: + optional_params.append(parameter) + else: + params.append(str(parameter)) + return params + + +def all_forward_schemas() -> Dict[str, _TorchSchema]: + """Returns schemas for all TorchScript forward ops.""" + torch_schemas = [_TorchSchema(s) for s in _C._jit_get_all_schemas()] + return {schema.name: schema for schema in torch_schemas if not schema.is_backward()} + + +def all_symbolics_schemas() -> Dict[str, _TorchSchema]: + """Returns schemas for all onnx supported ops.""" + symbolics_schemas = {} + + for name in registration.registry.all_functions(): + func_group = registration.registry.get_function_group(name) + assert func_group is not None + symbolics_schema = _TorchSchema(name) + func = func_group.get(_constants.ONNX_MAX_OPSET) + if func is not None: + symbolics_schema.arguments = _symbolic_argument_count(func) + symbolics_schema.opsets = list( + range(func_group.get_min_supported(), _constants.ONNX_MAX_OPSET + 1) + ) + else: + # Only support opset < 9 + func = func_group.get(7) + symbolics_schema.arguments = _symbolic_argument_count(func) + symbolics_schema.opsets = list(range(7, _constants.ONNX_BASE_OPSET)) + + symbolics_schemas[name] = symbolics_schema + + return symbolics_schemas diff --git a/.venv/Lib/site-packages/torch/onnx/_type_utils.py b/.venv/Lib/site-packages/torch/onnx/_type_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5bf8379e440bde853917526224019659f6b5b972 --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/_type_utils.py @@ -0,0 +1,391 @@ +# mypy: allow-untyped-defs +"""Utilities for converting and operating on ONNX, JIT and torch types.""" + +from __future__ import annotations + +import enum +import typing +from typing import Literal + +import torch +from torch._C import _onnx as _C_onnx +from torch.onnx import errors + + +if typing.TYPE_CHECKING: + # Hack to help mypy to recognize torch._C.Value + from torch import _C # noqa: F401 + +ScalarName = Literal[ + "Byte", + "Char", + "Double", + "Float", + "Half", + "Int", + "Long", + "Short", + "Bool", + "ComplexHalf", + "ComplexFloat", + "ComplexDouble", + "QInt8", + "QUInt8", + "QInt32", + "BFloat16", + "Float8E5M2", + "Float8E4M3FN", + "Float8E5M2FNUZ", + "Float8E4M3FNUZ", + "Undefined", +] + +TorchName = Literal[ + "bool", + "uint8_t", + "int8_t", + "double", + "float", + "half", + "int", + "int64_t", + "int16_t", + "complex32", + "complex64", + "complex128", + "qint8", + "quint8", + "qint32", + "bfloat16", + "float8_e5m2", + "float8_e4m3fn", + "float8_e5m2fnuz", + "float8_e4m3fnuz", +] + + +class JitScalarType(enum.IntEnum): + """Scalar types defined in torch. + + Use ``JitScalarType`` to convert from torch and JIT scalar types to ONNX scalar types. 
+ + Examples: + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_ONNX) + >>> # xdoctest: +IGNORE_WANT("win32 has different output") + >>> JitScalarType.from_value(torch.ones(1, 2)).onnx_type() + TensorProtoDataType.FLOAT + + >>> JitScalarType.from_value(torch_c_value_with_type_float).onnx_type() + TensorProtoDataType.FLOAT + + >>> JitScalarType.from_dtype(torch.get_default_dtype).onnx_type() + TensorProtoDataType.FLOAT + + """ + + # Order defined in https://github.com/pytorch/pytorch/blob/344defc9733a45fee8d0c4d3f5530f631e823196/c10/core/ScalarType.h + UINT8 = 0 + INT8 = enum.auto() # 1 + INT16 = enum.auto() # 2 + INT = enum.auto() # 3 + INT64 = enum.auto() # 4 + HALF = enum.auto() # 5 + FLOAT = enum.auto() # 6 + DOUBLE = enum.auto() # 7 + COMPLEX32 = enum.auto() # 8 + COMPLEX64 = enum.auto() # 9 + COMPLEX128 = enum.auto() # 10 + BOOL = enum.auto() # 11 + QINT8 = enum.auto() # 12 + QUINT8 = enum.auto() # 13 + QINT32 = enum.auto() # 14 + BFLOAT16 = enum.auto() # 15 + FLOAT8E5M2 = enum.auto() # 16 + FLOAT8E4M3FN = enum.auto() # 17 + FLOAT8E5M2FNUZ = enum.auto() # 18 + FLOAT8E4M3FNUZ = enum.auto() # 19 + UNDEFINED = enum.auto() # 20 + + @classmethod + def _from_name(cls, name: ScalarName | TorchName | str | None) -> JitScalarType: + """Convert a JIT scalar type or torch type name to ScalarType. + + Note: DO NOT USE this API when `name` comes from a `torch._C.Value.type()` calls. + A "RuntimeError: INTERNAL ASSERT FAILED at "../aten/src/ATen/core/jit_type_base.h" can + be raised in several scenarios where shape info is not present. + Instead use `from_value` API which is safer. + + Args: + name: JIT scalar type name (Byte) or torch type name (uint8_t). + + Returns: + JitScalarType + + Raises: + OnnxExporterError: if name is not a valid scalar type name or if it is None. + """ + if name is None: + raise errors.OnnxExporterError("Scalar type name cannot be None") + if valid_scalar_name(name): + return _SCALAR_NAME_TO_TYPE[name] # type: ignore[index] + if valid_torch_name(name): + return _TORCH_NAME_TO_SCALAR_TYPE[name] # type: ignore[index] + + raise errors.OnnxExporterError(f"Unknown torch or scalar type: '{name}'") + + @classmethod + def from_dtype(cls, dtype: torch.dtype | None) -> JitScalarType: + """Convert a torch dtype to JitScalarType. + + Note: DO NOT USE this API when `dtype` comes from a `torch._C.Value.type()` calls. + A "RuntimeError: INTERNAL ASSERT FAILED at "../aten/src/ATen/core/jit_type_base.h" can + be raised in several scenarios where shape info is not present. + Instead use `from_value` API which is safer. + + Args: + dtype: A torch.dtype to create a JitScalarType from + + Returns: + JitScalarType + + Raises: + OnnxExporterError: if dtype is not a valid torch.dtype or if it is None. + """ + if dtype not in _DTYPE_TO_SCALAR_TYPE: + raise errors.OnnxExporterError(f"Unknown dtype: {dtype}") + return _DTYPE_TO_SCALAR_TYPE[dtype] + + @classmethod + def from_onnx_type( + cls, onnx_type: int | _C_onnx.TensorProtoDataType | None + ) -> JitScalarType: + """Convert a ONNX data type to JitScalarType. + + Args: + onnx_type: A torch._C._onnx.TensorProtoDataType to create a JitScalarType from + + Returns: + JitScalarType + + Raises: + OnnxExporterError: if dtype is not a valid torch.dtype or if it is None. 
+        """
+        if onnx_type not in _ONNX_TO_SCALAR_TYPE:
+            raise errors.OnnxExporterError(f"Unknown onnx_type: {onnx_type}")
+        return _ONNX_TO_SCALAR_TYPE[typing.cast(_C_onnx.TensorProtoDataType, onnx_type)]
+
+    @classmethod
+    def from_value(
+        cls, value: None | torch._C.Value | torch.Tensor, default=None
+    ) -> JitScalarType:
+        """Create a JitScalarType from a value's scalar type.
+
+        Args:
+            value: An object to fetch the scalar type from.
+            default: The JitScalarType to return if a valid scalar type cannot be
+                fetched from value.
+
+        Returns:
+            JitScalarType.
+
+        Raises:
+            OnnxExporterError: if value does not have a valid scalar type and default is None.
+            SymbolicValueError: when value.type()'s info is empty and default is None.
+        """
+
+        if not isinstance(value, (torch._C.Value, torch.Tensor)) or (
+            isinstance(value, torch._C.Value) and value.node().mustBeNone()
+        ):
+            # The default JitScalarType is returned when value is not valid
+            if default is None:
+                raise errors.OnnxExporterError(
+                    "value must be either a torch._C.Value or a torch.Tensor object."
+                )
+            elif not isinstance(default, JitScalarType):
+                raise errors.OnnxExporterError(
+                    "default value must be a JitScalarType object."
+                )
+            return default
+
+        # Each value type has its own way of storing its scalar type
+        if isinstance(value, torch.Tensor):
+            return cls.from_dtype(value.dtype)
+        if isinstance(value.type(), torch.ListType):
+            try:
+                return cls.from_dtype(value.type().getElementType().dtype())
+            except RuntimeError:
+                return cls._from_name(str(value.type().getElementType()))
+        if isinstance(value.type(), torch._C.OptionalType):
+            if value.type().getElementType().dtype() is None:
+                if isinstance(default, JitScalarType):
+                    return default
+                raise errors.OnnxExporterError(
+                    "default value must be a JitScalarType object."
+                )
+            return cls.from_dtype(value.type().getElementType().dtype())
+
+        scalar_type = None
+        if value.node().kind() != "prim::Constant" or not isinstance(
+            value.type(), torch._C.NoneType
+        ):
+            # value must be a non-list torch._C.Value scalar
+            scalar_type = value.type().scalarType()
+
+        if scalar_type is not None:
+            return cls._from_name(scalar_type)
+
+        # When everything fails...
try to default + if default is not None: + return default + raise errors.SymbolicValueError( + f"Cannot determine scalar type for this '{type(value.type())}' instance and " + "a default value was not provided.", + value, + ) + + def scalar_name(self) -> ScalarName: + """Convert a JitScalarType to a JIT scalar type name.""" + return _SCALAR_TYPE_TO_NAME[self] + + def torch_name(self) -> TorchName: + """Convert a JitScalarType to a torch type name.""" + return _SCALAR_TYPE_TO_TORCH_NAME[self] + + def dtype(self) -> torch.dtype: + """Convert a JitScalarType to a torch dtype.""" + return _SCALAR_TYPE_TO_DTYPE[self] + + def onnx_type(self) -> _C_onnx.TensorProtoDataType: + """Convert a JitScalarType to an ONNX data type.""" + if self not in _SCALAR_TYPE_TO_ONNX: + raise errors.OnnxExporterError( + f"Scalar type {self} cannot be converted to ONNX" + ) + return _SCALAR_TYPE_TO_ONNX[self] + + def onnx_compatible(self) -> bool: + """Return whether this JitScalarType is compatible with ONNX.""" + return ( + self in _SCALAR_TYPE_TO_ONNX + and self != JitScalarType.UNDEFINED + and self != JitScalarType.COMPLEX32 + ) + + +def valid_scalar_name(scalar_name: ScalarName | str) -> bool: + """Return whether the given scalar name is a valid JIT scalar type name.""" + return scalar_name in _SCALAR_NAME_TO_TYPE + + +def valid_torch_name(torch_name: TorchName | str) -> bool: + """Return whether the given torch name is a valid torch type name.""" + return torch_name in _TORCH_NAME_TO_SCALAR_TYPE + + +# https://github.com/pytorch/pytorch/blob/344defc9733a45fee8d0c4d3f5530f631e823196/c10/core/ScalarType.h +_SCALAR_TYPE_TO_NAME: dict[JitScalarType, ScalarName] = { + JitScalarType.BOOL: "Bool", + JitScalarType.UINT8: "Byte", + JitScalarType.INT8: "Char", + JitScalarType.INT16: "Short", + JitScalarType.INT: "Int", + JitScalarType.INT64: "Long", + JitScalarType.HALF: "Half", + JitScalarType.FLOAT: "Float", + JitScalarType.DOUBLE: "Double", + JitScalarType.COMPLEX32: "ComplexHalf", + JitScalarType.COMPLEX64: "ComplexFloat", + JitScalarType.COMPLEX128: "ComplexDouble", + JitScalarType.QINT8: "QInt8", + JitScalarType.QUINT8: "QUInt8", + JitScalarType.QINT32: "QInt32", + JitScalarType.BFLOAT16: "BFloat16", + JitScalarType.FLOAT8E5M2: "Float8E5M2", + JitScalarType.FLOAT8E4M3FN: "Float8E4M3FN", + JitScalarType.FLOAT8E5M2FNUZ: "Float8E5M2FNUZ", + JitScalarType.FLOAT8E4M3FNUZ: "Float8E4M3FNUZ", + JitScalarType.UNDEFINED: "Undefined", +} + +_SCALAR_NAME_TO_TYPE: dict[ScalarName, JitScalarType] = { + v: k for k, v in _SCALAR_TYPE_TO_NAME.items() +} + +_SCALAR_TYPE_TO_TORCH_NAME: dict[JitScalarType, TorchName] = { + JitScalarType.BOOL: "bool", + JitScalarType.UINT8: "uint8_t", + JitScalarType.INT8: "int8_t", + JitScalarType.INT16: "int16_t", + JitScalarType.INT: "int", + JitScalarType.INT64: "int64_t", + JitScalarType.HALF: "half", + JitScalarType.FLOAT: "float", + JitScalarType.DOUBLE: "double", + JitScalarType.COMPLEX32: "complex32", + JitScalarType.COMPLEX64: "complex64", + JitScalarType.COMPLEX128: "complex128", + JitScalarType.QINT8: "qint8", + JitScalarType.QUINT8: "quint8", + JitScalarType.QINT32: "qint32", + JitScalarType.BFLOAT16: "bfloat16", + JitScalarType.FLOAT8E5M2: "float8_e5m2", + JitScalarType.FLOAT8E4M3FN: "float8_e4m3fn", + JitScalarType.FLOAT8E5M2FNUZ: "float8_e5m2fnuz", + JitScalarType.FLOAT8E4M3FNUZ: "float8_e4m3fnuz", +} + +_TORCH_NAME_TO_SCALAR_TYPE: dict[TorchName, JitScalarType] = { + v: k for k, v in _SCALAR_TYPE_TO_TORCH_NAME.items() +} + +_SCALAR_TYPE_TO_ONNX = { + JitScalarType.BOOL: 
_C_onnx.TensorProtoDataType.BOOL, + JitScalarType.UINT8: _C_onnx.TensorProtoDataType.UINT8, + JitScalarType.INT8: _C_onnx.TensorProtoDataType.INT8, + JitScalarType.INT16: _C_onnx.TensorProtoDataType.INT16, + JitScalarType.INT: _C_onnx.TensorProtoDataType.INT32, + JitScalarType.INT64: _C_onnx.TensorProtoDataType.INT64, + JitScalarType.HALF: _C_onnx.TensorProtoDataType.FLOAT16, + JitScalarType.FLOAT: _C_onnx.TensorProtoDataType.FLOAT, + JitScalarType.DOUBLE: _C_onnx.TensorProtoDataType.DOUBLE, + JitScalarType.COMPLEX64: _C_onnx.TensorProtoDataType.COMPLEX64, + JitScalarType.COMPLEX128: _C_onnx.TensorProtoDataType.COMPLEX128, + JitScalarType.BFLOAT16: _C_onnx.TensorProtoDataType.BFLOAT16, + JitScalarType.UNDEFINED: _C_onnx.TensorProtoDataType.UNDEFINED, + JitScalarType.COMPLEX32: _C_onnx.TensorProtoDataType.UNDEFINED, + JitScalarType.QINT8: _C_onnx.TensorProtoDataType.INT8, + JitScalarType.QUINT8: _C_onnx.TensorProtoDataType.UINT8, + JitScalarType.QINT32: _C_onnx.TensorProtoDataType.INT32, + JitScalarType.FLOAT8E5M2: _C_onnx.TensorProtoDataType.FLOAT8E5M2, + JitScalarType.FLOAT8E4M3FN: _C_onnx.TensorProtoDataType.FLOAT8E4M3FN, + JitScalarType.FLOAT8E5M2FNUZ: _C_onnx.TensorProtoDataType.FLOAT8E5M2FNUZ, + JitScalarType.FLOAT8E4M3FNUZ: _C_onnx.TensorProtoDataType.FLOAT8E4M3FNUZ, +} + +_ONNX_TO_SCALAR_TYPE = {v: k for k, v in _SCALAR_TYPE_TO_ONNX.items()} + +# source of truth is +# https://github.com/pytorch/pytorch/blob/master/torch/csrc/utils/tensor_dtypes.cpp +_SCALAR_TYPE_TO_DTYPE = { + JitScalarType.BOOL: torch.bool, + JitScalarType.UINT8: torch.uint8, + JitScalarType.INT8: torch.int8, + JitScalarType.INT16: torch.short, + JitScalarType.INT: torch.int, + JitScalarType.INT64: torch.int64, + JitScalarType.HALF: torch.half, + JitScalarType.FLOAT: torch.float, + JitScalarType.DOUBLE: torch.double, + JitScalarType.COMPLEX32: torch.complex32, + JitScalarType.COMPLEX64: torch.complex64, + JitScalarType.COMPLEX128: torch.complex128, + JitScalarType.QINT8: torch.qint8, + JitScalarType.QUINT8: torch.quint8, + JitScalarType.QINT32: torch.qint32, + JitScalarType.BFLOAT16: torch.bfloat16, + JitScalarType.FLOAT8E5M2: torch.float8_e5m2, + JitScalarType.FLOAT8E4M3FN: torch.float8_e4m3fn, + JitScalarType.FLOAT8E5M2FNUZ: torch.float8_e5m2fnuz, + JitScalarType.FLOAT8E4M3FNUZ: torch.float8_e4m3fnuz, +} + +_DTYPE_TO_SCALAR_TYPE = {v: k for k, v in _SCALAR_TYPE_TO_DTYPE.items()} diff --git a/.venv/Lib/site-packages/torch/onnx/errors.py b/.venv/Lib/site-packages/torch/onnx/errors.py new file mode 100644 index 0000000000000000000000000000000000000000..153f1f281defd1aa20262b4f5bde768ad02563cd --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/errors.py @@ -0,0 +1,103 @@ +"""ONNX exporter exceptions.""" + +from __future__ import annotations + + +__all__ = [ + "OnnxExporterWarning", + "SymbolicValueError", + "UnsupportedOperatorError", +] + +import textwrap +from typing import TYPE_CHECKING + + +if TYPE_CHECKING: + from torch import _C + + +class OnnxExporterWarning(UserWarning): + """Warnings in the ONNX exporter.""" + + +class OnnxExporterError(RuntimeError): + """Errors raised by the ONNX exporter. 
This is the base class for all exporter errors.""" + + +class UnsupportedOperatorError(OnnxExporterError): + """Raised when an operator is unsupported by the exporter.""" + + # NOTE: This is legacy and is only used by the torchscript exporter + # Clean up when the torchscript exporter is removed + def __init__(self, name: str, version: int, supported_version: int | None): + from torch.onnx import _constants + from torch.onnx._internal import diagnostics + + if supported_version is not None: + diagnostic_rule: diagnostics.infra.Rule = ( + diagnostics.rules.operator_supported_in_newer_opset_version + ) + msg = diagnostic_rule.format_message(name, version, supported_version) + diagnostics.diagnose(diagnostic_rule, diagnostics.levels.ERROR, msg) + else: + if name.startswith(("aten::", "prim::", "quantized::")): + diagnostic_rule = diagnostics.rules.missing_standard_symbolic_function + msg = diagnostic_rule.format_message( + name, version, _constants.PYTORCH_GITHUB_ISSUES_URL + ) + diagnostics.diagnose(diagnostic_rule, diagnostics.levels.ERROR, msg) + else: + diagnostic_rule = diagnostics.rules.missing_custom_symbolic_function + msg = diagnostic_rule.format_message(name) + diagnostics.diagnose(diagnostic_rule, diagnostics.levels.ERROR, msg) + super().__init__(msg) + + +class SymbolicValueError(OnnxExporterError): + """Errors around TorchScript values and nodes.""" + + # NOTE: This is legacy and is only used by the torchscript exporter + # Clean up when the torchscript exporter is removed + def __init__(self, msg: str, value: _C.Value): + message = ( + f"{msg} [Caused by the value '{value}' (type '{value.type()}') in the " + f"TorchScript graph. The containing node has kind '{value.node().kind()}'.] " + ) + + code_location = value.node().sourceRange() + if code_location: + message += f"\n (node defined in {code_location})" + + try: + # Add its input and output to the message. + message += "\n\n" + message += textwrap.indent( + ( + "Inputs:\n" + + ( + "\n".join( + f" #{i}: {input_} (type '{input_.type()}')" + for i, input_ in enumerate(value.node().inputs()) + ) + or " Empty" + ) + + "\n" + + "Outputs:\n" + + ( + "\n".join( + f" #{i}: {output} (type '{output.type()}')" + for i, output in enumerate(value.node().outputs()) + ) + or " Empty" + ) + ), + " ", + ) + except AttributeError: + message += ( + " Failed to obtain its input and output for debugging. " + "Please refer to the TorchScript graph for debugging information." + ) + + super().__init__(message) diff --git a/.venv/Lib/site-packages/torch/onnx/operators.py b/.venv/Lib/site-packages/torch/onnx/operators.py new file mode 100644 index 0000000000000000000000000000000000000000..88d52e0f3e4cda973a95e95d2425aa681311abb0 --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/operators.py @@ -0,0 +1,47 @@ +# mypy: allow-untyped-defs +r"""This file provides a location for operators that help exporting models via onnx. + +E.g. `shape_as_tensor` and `reshape_from_tensor_shape` +are to make all dynamic sizes operations traceable. + +NOTE: at one point these functions were implemented differently. +Since then we have implemented these directly in ATen, so this +file is kept purely for backward-compatibility. +""" + +import torch +import torch.onnx + + +def shape_as_tensor(x): + """Get the shape of a tensor as a tensor. + + Args: + x (Tensor): The input tensor. + + Returns: + Tensor: A tensor of shape [len(x.shape)] containing the size of each dimension of x. 
+ + Example: + >>> x = torch.randn(2, 3) + >>> shape_as_tensor(x) + tensor([2, 3]) + + """ + return torch._shape_as_tensor(x) + + +def reshape_from_tensor_shape(x, shape): + """Reshape a tensor to the given shape. + + This function is used to make dynamic size operations traceable when exporting models via ONNX. + This function is kept for backward-compatibility. It is implemented directly in ATen. + + Parameters: + x (Tensor): the tensor to be reshaped. + shape (Tensor): the target shape. + + Returns: + Tensor: the reshaped tensor. + """ + return torch._reshape_from_tensor(x, shape) diff --git a/.venv/Lib/site-packages/torch/onnx/symbolic_caffe2.py b/.venv/Lib/site-packages/torch/onnx/symbolic_caffe2.py new file mode 100644 index 0000000000000000000000000000000000000000..a28cf9dd2fc76a1ddedc9885fd4ad611c5e024fa --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/symbolic_caffe2.py @@ -0,0 +1,361 @@ +# mypy: allow-untyped-defs +# mypy: disable-error-code=arg-type +import importlib +import inspect + +from torch.onnx import symbolic_helper, symbolic_opset9 as opset9 +from torch.onnx._internal import jit_utils, registration + + +def register_quantized_ops(domain: str, version: int): + # Register all quantized ops + module = importlib.import_module("torch.onnx.symbolic_caffe2") + quant_version_ops = inspect.getmembers(module) + aten_q_ops = { + "relu", + "_empty_affine_quantized", + "dequantize", + "quantize_per_tensor", + "upsample_nearest2d", + "avg_pool2d", + "reshape", + "slice", + "cat", + "max_pool2d", + "sigmoid", + } + for op, func in quant_version_ops: + name = f"{domain}::{op}" + if inspect.isfunction(func) and not registration.registry.is_registered_op( + name, version + ): + if op in aten_q_ops: + # Override the builtin aten ops + registration.registry.register( + f"aten::{op}", version, func, custom=True + ) + registration.registry.register(name, version, func) + + +def _permute_helper(g: jit_utils.GraphContext, input, axes): + quant_args = { + "axes_i": axes, + "Y_scale_f": symbolic_helper._node_get(input.node(), "Y_scale"), + "Y_zero_point_i": symbolic_helper._node_get(input.node(), "Y_zero_point"), + } + output = g.op("_caffe2::Int8Transpose", input, **quant_args) + symbolic_helper._quantized_ops.add(output) + return output + + +def nchw2nhwc(g: jit_utils.GraphContext, input): + axes = [0, 2, 3, 1] + return _permute_helper(g, input, axes) + + +def nhwc2nchw(g: jit_utils.GraphContext, input): + axes = [0, 3, 1, 2] + return _permute_helper(g, input, axes) + + +def linear_prepack(g: jit_utils.GraphContext, weight, bias): + # Mapping to a dummy caffe2 prepack node. + # During the onnx -> c2 conversion we can look up original weight and bias + # from this node + output = g.op("_caffe2::WeightPrepack", weight, bias) + symbolic_helper._quantized_ops.add(output) + return output + + +@symbolic_helper.parse_args("v", "v", "v", "f", "i") +def linear(g: jit_utils.GraphContext, input, weight, bias, scale, zero_point): + kwargs = { + "Y_scale_f": scale, + "Y_zero_point_i": zero_point, + } + output = g.op("_caffe2::Int8FC", input, weight, bias, **kwargs) + symbolic_helper._quantized_ops.add(output) + return output + + +def conv_prepack( + g: jit_utils.GraphContext, input, weight, bias, stride, padding, dilation, groups +): + # Mapping to a dummy caffe2 prepack node. 
+ # During the onnx -> c2 conversion we can look up original weight and bias + # from this node + output = g.op("_caffe2::WeightPrepack", input, weight, bias) + symbolic_helper._quantized_ops.add(output) + return output + + +@symbolic_helper.parse_args("v", "v", "v", "is", "is", "is", "i", "f", "i") +def conv2d( + g: jit_utils.GraphContext, + input, + weight, + bias, + stride, + padding, + dilation, + groups, + scale, + zero_point, +): + kernel_size = weight.node()["shape"][1:3] + kwargs = { + "strides_i": stride, + "pads_i": padding + padding, + "dilations_i": dilation, + "group_i": groups, + "kernels_i": kernel_size, + "order_s": "NHWC", + "Y_scale_f": scale, + "Y_zero_point_i": zero_point, + } + output = g.op("_caffe2::Int8Conv", input, weight, bias, **kwargs) + symbolic_helper._quantized_ops.add(output) + return output + + +@symbolic_helper.parse_args("v", "v", "v", "is", "is", "is", "i", "f", "i") +def conv2d_relu( + g: jit_utils.GraphContext, + input, + weight, + bias, + stride, + padding, + dilation, + groups, + scale, + zero_point, +): + kernel_size = weight.node()["shape"][1:3] + kwargs = { + "strides_i": stride, + "pads_i": padding + padding, + "dilations_i": dilation, + "group_i": groups, + "kernels_i": kernel_size, + "order_s": "NHWC", + "Y_scale_f": scale, + "Y_zero_point_i": zero_point, + } + output = g.op("_caffe2::Int8ConvRelu", input, weight, bias, **kwargs) + symbolic_helper._quantized_ops.add(output) + return output + + +@symbolic_helper.parse_args("v", "v", "f", "i") +def add(g: jit_utils.GraphContext, input_a, input_b, scale, zero_point): + kwargs = { + "Y_scale_f": scale, + "Y_zero_point_i": zero_point, + } + output = g.op("_caffe2::Int8Add", input_a, input_b, **kwargs) + symbolic_helper._quantized_ops.add(output) + return output + + +@symbolic_helper.parse_args("v") +def relu(g: jit_utils.GraphContext, input): + if input not in symbolic_helper._quantized_ops: + return opset9.relu(g, input) + kwargs = { + "Y_scale_f": symbolic_helper._node_get(input.node(), "Y_scale"), + "Y_zero_point_i": symbolic_helper._node_get(input.node(), "Y_zero_point"), + } + output = g.op("_caffe2::Int8Relu", input, **kwargs) + symbolic_helper._quantized_ops.add(output) + return output + + +@symbolic_helper.parse_args("v", "f", "i", "t") +def quantize_per_tensor(g: jit_utils.GraphContext, input, scale, zero_point, dtype): + kwargs = { + "Y_scale_f": scale, + "Y_zero_point_i": zero_point, + } + output = g.op("_caffe2::Int8Quantize", input, **kwargs) + symbolic_helper._quantized_ops.add(output) + return output + + +@symbolic_helper.parse_args("v") +def dequantize(g: jit_utils.GraphContext, input): + return g.op("_caffe2::Int8Dequantize", input) + + +@symbolic_helper.parse_args("v", "t", "t", "t", "t", "t", "t", "t") +def _empty_affine_quantized( + g: jit_utils.GraphContext, + input, + shape, + scale, + zero_point, + dtype, + pin_memory, + memory_format, + layout, +): + return input + + +def upsample_nearest2d( + g: jit_utils.GraphContext, + input, + output_size, + align_corners=None, + scales_h=None, + scales_w=None, +): + if input not in symbolic_helper._quantized_ops: + return opset9.upsample_nearest2d(g, input, output_size, align_corners) # type: ignore[attr-defined] + + output_size = symbolic_helper._parse_arg(output_size, "is") + kwargs = { + "output_size_i": output_size, + "Y_scale_f": symbolic_helper._node_get(input.node(), "Y_scale"), + "Y_zero_point_i": symbolic_helper._node_get(input.node(), "Y_zero_point"), + } + input = nchw2nhwc(g, input) + output = 
g.op("_caffe2::Int8ResizeNearest", input, **kwargs) + output = nhwc2nchw(g, output) + symbolic_helper._quantized_ops.add(output) + return output + + +@symbolic_helper.parse_args("v", "is", "is", "is", "is", "i") +def max_pool2d( + g: jit_utils.GraphContext, + input, + kernel_size, + stride, + padding, + dilation, + ceil_mode, +): + if input not in symbolic_helper._quantized_ops: + return opset9.max_pool2d( # type: ignore[attr-defined] + g, input, kernel_size, stride, padding, dilation, ceil_mode + ) + kwargs = { + "strides_i": stride, + "pads_i": padding + padding, + "kernel_i": kernel_size[0], + "order_s": "NHWC", + "Y_scale_f": symbolic_helper._node_get(input.node(), "Y_scale"), + "Y_zero_point_i": symbolic_helper._node_get(input.node(), "Y_zero_point"), + } + input = nchw2nhwc(g, input) + output = g.op("_caffe2::Int8MaxPool", input, **kwargs) + output = nhwc2nchw(g, output) + symbolic_helper._quantized_ops.add(output) + return output + + +@symbolic_helper.parse_args("v", "is", "is", "is", "i", "i", "none") +def avg_pool2d( + g: jit_utils.GraphContext, + input, + kernel_size, + stride, + padding, + ceil_mode, + count_include_pad, + divisor_override=None, +): + if input not in symbolic_helper._quantized_ops: + return opset9.avg_pool2d( # type: ignore[attr-defined] + g, + input, + kernel_size, + stride, + padding, + ceil_mode, + count_include_pad, + divisor_override, + ) + kwargs = { + "strides_i": stride, + "pads_i": padding + padding, + "kernel_i": kernel_size[0], + "order_s": "NHWC", + "Y_scale_f": symbolic_helper._node_get(input.node(), "Y_scale"), + "Y_zero_point_i": symbolic_helper._node_get(input.node(), "Y_zero_point"), + } + input = nchw2nhwc(g, input) + output = g.op("_caffe2::Int8AveragePool", input, **kwargs) + output = nhwc2nchw(g, output) + symbolic_helper._quantized_ops.add(output) + return output + + +def reshape(g: jit_utils.GraphContext, input, shape): + if input not in symbolic_helper._quantized_ops: + return opset9.reshape(g, input, shape) + + kwargs = { + "Y_scale_f": symbolic_helper._node_get(input.node(), "Y_scale"), + "Y_zero_point_i": symbolic_helper._node_get(input.node(), "Y_zero_point"), + } + output = g.op("_caffe2::Int8Reshape", input, shape, **kwargs) + symbolic_helper._quantized_ops.add(output) + return output + + +@symbolic_helper.parse_args("v", "v", "v", "v", "i") +def slice(g: jit_utils.GraphContext, input, dim, start, end, step): + if input not in symbolic_helper._quantized_ops: + return opset9.slice(g, input, dim, start, end, step) + + if step != 1: + raise RuntimeError("ONNX quantized slice export only works for step 1.") + start = symbolic_helper._parse_arg(start, "i") + end = symbolic_helper._parse_arg(end, "i") + dim = symbolic_helper._parse_arg(dim, "i") + + kwargs = { + "start_idx_i": start, + "end_idx_i": end, + "dim_i": dim, + "Y_scale_f": symbolic_helper._node_get(input.node(), "Y_scale"), + "Y_zero_point_i": symbolic_helper._node_get(input.node(), "Y_zero_point"), + } + output = g.op("_caffe2::Int8Slice", input, **kwargs) + symbolic_helper._quantized_ops.add(output) + return output + + +def cat(g: jit_utils.GraphContext, tensor_list, dim, scale=None, zero_point=None): + tensors = symbolic_helper._unpack_list(tensor_list) + input = tensors[0] + if input not in symbolic_helper._quantized_ops: + return opset9.cat(g, tensor_list, dim) + + dim = symbolic_helper._parse_arg(dim, "i") + kwargs = { + "Y_scale_f": tensors[0].node()["Y_scale"], + "Y_zero_point_i": tensors[0].node()["Y_zero_point"], + } + output = g.op("_caffe2::Int8Concat", *tensors, 
axis_i=dim, **kwargs) + symbolic_helper._quantized_ops.add(output) + return output + + +@symbolic_helper.parse_args("v") +def sigmoid(g: jit_utils.GraphContext, input): + if input not in symbolic_helper._quantized_ops: + return opset9.sigmoid(g, input) + # Caffe2 expects the output scale to be 1/2^8 + # and output zero_point to be 0 (quint8 type) + out_scale = 1.0 / 256 + zero_point = 0 + kwargs = { + "Y_scale_f": out_scale, + "Y_zero_point_i": zero_point, + } + output = g.op("_caffe2::Int8Sigmoid", input, **kwargs) + symbolic_helper._quantized_ops.add(output) + return output diff --git a/.venv/Lib/site-packages/torch/onnx/symbolic_helper.py b/.venv/Lib/site-packages/torch/onnx/symbolic_helper.py new file mode 100644 index 0000000000000000000000000000000000000000..29c176ad9565d53e12504b650679fdc6464bd927 --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/symbolic_helper.py @@ -0,0 +1,2261 @@ +# mypy: allow-untyped-defs +from __future__ import annotations + +import functools +import inspect +import math +import sys +import typing +import warnings +from typing import Any, Callable, Literal, NoReturn, Sequence, TypeVar as _TypeVar +from typing_extensions import Concatenate as _Concatenate, ParamSpec as _ParamSpec + +import torch +import torch._C._onnx as _C_onnx +from torch import _C + +# Monkey-patch graph manipulation methods on Graph, used for the ONNX symbolics +from torch.onnx import _constants, _type_utils, errors, utils +from torch.onnx._globals import GLOBALS +from torch.onnx._internal import jit_utils + + +if typing.TYPE_CHECKING: + from torch.types import Number + +_T = _TypeVar("_T") +_U = _TypeVar("_U") +_P = _ParamSpec("_P") + +# --------------------------------------------------------------------------------- +# Helper functions +# --------------------------------------------------------------------------------- + +_ValueDescriptor = Literal[ + "v", + "i", + "is", + "f", + "fs", + "b", + "s", + "t", + "none", +] + + +def _parse_arg( + value, + desc: _ValueDescriptor, + arg_name: str | None = None, + node_name: str | None = None, +): + if desc == "none": + return value + if desc == "v" or not _is_value(value): + return value + + node = value.node() + if node.mustBeNone(): + return None + if node.kind() == "onnx::Constant": + node_val = _node_get(node, "value") + if desc == "i": + return int(node_val) + elif desc == "f": + return float(node_val) + elif desc == "b": + return bool(node_val) + elif desc == "s": + return str(node_val) + elif desc == "t": + return node_val + elif desc == "is": + return [int(v) for v in node_val] + elif desc == "fs": + return [float(v) for v in node_val] + else: + raise errors.SymbolicValueError( + f"ONNX symbolic does not understand the Constant node '{node}' " + f"specified with descriptor '{desc}'.", + value, + ) + elif node.kind() == "prim::ListConstruct": + if desc == "is": + for v in node.inputs(): + element_node = v.node() + if element_node.kind() != "onnx::Constant": + raise errors.SymbolicValueError( + f"Failed to export a node '{element_node}' " + f"(in list node {node}) " + f"because it is not constant. " + f"Please try to make things (e.g. 
kernel sizes) static if possible.",
+                        value,
+                    )
+            return [int(_node_get(v.node(), "value")) for v in value.node().inputs()]
+        else:
+            raise errors.SymbolicValueError(
+                f"ONNX symbolic does not know how to unpack the ListConstruct node that "
+                f"is not a list of integers: '{node}'",
+                value,
+            )
+
+    if arg_name is None or node_name is None:
+        raise errors.SymbolicValueError(
+            f"Expected node type 'onnx::Constant', got '{node.kind()}'.",
+            value,
+        )
+
+    raise errors.SymbolicValueError(
+        "Expected node type 'onnx::Constant' "
+        f"for argument '{arg_name}' of node '{node_name}', got '{node.kind()}'.",
+        value,
+    )
+
+
+def _node_get(node: _C.Node, key: str):
+    """Gets the attribute `key` of `node`; polymorphic over the attribute's type."""
+    assert isinstance(node, _C.Node)
+    sel = node.kindOf(key)
+    return getattr(node, sel)(key)
+
+
+def _is_onnx_constant(value: _C.Value):
+    """Whether a Value is an ONNX constant."""
+    return value.node().kind() == "onnx::Constant"
+
+
+def _maybe_get_const(
+    value: _C.Value | torch.Tensor | Number | Sequence | None,
+    descriptor: _ValueDescriptor,
+):
+    # NOTE: prim::Constant at this stage usually means something not compatible in ONNX,
+    # otherwise it'd be converted to onnx::Constant
+    # TODO(justinchuby): Replace isinstance with _is_value once we figure out mypy
+    if isinstance(value, _C.Value) and _is_onnx_constant(value):
+        return _parse_arg(value, descriptor)
+    return value
+
+
+def _maybe_get_scalar(value):
+    value_t = _maybe_get_const(value, "t")
+    if isinstance(value_t, torch.Tensor) and value_t.shape == ():
+        return value_t
+    return value
+
+
+def _get_const(value, desc, arg_name):
+    if not _is_constant(value):
+        raise errors.SymbolicValueError(
+            f"ONNX symbolic expected a constant value of the '{arg_name}' argument, "
+            f"got '{value}'",
+            value,
+        )
+    return _parse_arg(value, desc)
+
+
+def _unpack_list(list_value: _C.Value) -> list[_C.Value]:
+    list_node = list_value.node()
+    if list_node.kind() != "prim::ListConstruct":
+        raise errors.SymbolicValueError(
+            f"ONNX symbolic expected node type prim::ListConstruct, "
+            f"got '{list_node}'.",
+            list_value,
+        )
+    return list(list_node.inputs())
+
+
+def _unpack_tuple(tuple_value: _C.Value) -> tuple[_C.Value, ...]:
+    tuple_node = tuple_value.node()
+    if not _is_tuple_construct(tuple_value):
+        raise errors.SymbolicValueError(
+            f"ONNX symbolic expected node type 'prim::TupleConstruct', "
+            f"got '{tuple_node.kind()}'.",
+            tuple_value,
+        )
+    return tuple(tuple_node.inputs())
+
+
+def _unpack_quantized_tensor(tuple_value: _C.Value) -> tuple[_C.Value, ...]:
+    """Unpacks a quantized tensor into a tuple of tensor and scale/zero_point.
+
+    Args:
+        tuple_value: A tuple of tensor, scale, zero_point, and optionally axis.
+
+    Returns:
+        A tuple of tensor, scale, zero_point, and optionally axis.
+    """
+    tuple_node = tuple_value.node()
+    # A quantized tensor is represented as a tuple of the form
+    # (tensor, scale, zero_point[, axis])
+    if not _is_tuple_construct(tuple_value):
+        raise errors.SymbolicValueError(
+            f"ONNX symbolic expected the output of `{tuple_node}` to be a quantized "
+            f"tensor. This is likely due to missing support for quantized "
+            f"`{tuple_node.kind()}`. Please create an issue on {_constants.PYTORCH_GITHUB_ISSUES_URL}",
+            tuple_value,
+        )
+    unpacked = tuple(tuple_node.inputs())
+    assert len(unpacked) == 3 or len(unpacked) == 4
+    return unpacked
+
+
+# Check if list_value is the output of prim::ListConstruct.
+# This is usually called before _unpack_list to ensure the list can be unpacked.
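+# A minimal usage sketch (editor's illustration; `my_symbolic` and its arguments
+# are hypothetical, not part of this module):
+#
+#     def my_symbolic(g, sizes):
+#         if _is_packed_list(sizes):
+#             size_values = _unpack_list(sizes)  # list[_C.Value], one per element
+#         ...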
+def _is_packed_list(list_value: Any) -> bool:
+    return _is_value(list_value) and list_value.node().kind() == "prim::ListConstruct"
+
+
+def parse_args(
+    *arg_descriptors: _ValueDescriptor,
+) -> Callable[[Callable[_Concatenate[_U, _P], _T]], Callable[_Concatenate[_U, _P], _T]]:
+    """A decorator which converts args from torch._C.Value to built-in types.
+
+    For example:
+
+    ```
+    @parse_args('v', 'i', 'fs')
+    def foo(g, a, b, c):
+        assert isinstance(a, torch._C.Value)
+        assert isinstance(b, int)
+        assert isinstance(c, list)
+        assert isinstance(c[0], float)
+    ```
+
+    Args:
+        arg_descriptors: list of str, where each element is
+            a string that specifies the type to convert to. Valid descriptors:
+            "v": no conversion, keep torch._C.Value.
+            "i": int
+            "is": list of int
+            "f": float
+            "fs": list of float
+            "b": bool
+            "s": str
+            "t": torch.Tensor
+            "none": the variable is unused
+    """
+
+    def decorator(
+        fn: Callable[_Concatenate[_U, _P], _T],
+    ) -> Callable[_Concatenate[_U, _P], _T]:
+        fn._arg_descriptors = arg_descriptors  # type: ignore[attr-defined]
+
+        @functools.wraps(fn)
+        def wrapper(g: _U, *args: _P.args, **kwargs: _P.kwargs) -> _T:
+            # some args may be optional, so the length may be smaller
+            FILE_BUG_MSG = (
+                "If you believe this is not due to custom symbolic implementation within your code or "
+                "an external library, please file an issue at "
+                "https://github.com/pytorch/pytorch/issues/new?template=bug-report.yml to report this bug."
+            )
+            assert len(arg_descriptors) >= len(args), (
+                f"A mismatch between the number of arguments ({len(args)}) and "
+                f"their descriptors ({len(arg_descriptors)}) was found at symbolic function '{fn.__name__}'. "
+                f"{FILE_BUG_MSG}"
+            )
+
+            try:
+                sig = inspect.signature(fn)
+                arg_names = list(sig.parameters.keys())[1:]
+                fn_name = fn.__name__
+            except Exception:
+                # FIXME(justinchuby): Avoid catching Exception.
+                # Catch a more specific exception instead.
+                arg_names = [None] * len(args)  # type: ignore[list-item]
+                fn_name = None
+            args = [
+                _parse_arg(arg, arg_desc, arg_name, fn_name)  # type: ignore[method-assign]
+                for arg, arg_desc, arg_name in zip(args, arg_descriptors, arg_names)
+            ]
+            # only support _outputs in kwargs
+            assert len(kwargs) <= 1, (
+                f"Symbolic function {fn.__name__}'s '**kwargs' can contain a single "
+                f"key/value entry. "
+                f"{FILE_BUG_MSG}"
+            )
+
+            if len(kwargs) == 1:
+                assert "_outputs" in kwargs, (
+                    f"Symbolic function {fn.__name__}'s '**kwargs' can only contain "
+                    f"'_outputs' key at '**kwargs'. "
+                    f"{FILE_BUG_MSG}"
+                )
+            return fn(g, *args, **kwargs)
+
+        return wrapper
+
+    return decorator
+
+
+def quantized_args(
+    *arg_q_descriptors: bool,
+    scale: float | None = None,
+    zero_point: int | None = None,
+    quantize_output: bool = True,
+) -> Callable[[Callable[_P, _T]], Callable[_P, _T]]:
+    """A decorator which extends support for the quantized version of the base operator.
+
+    Quantization is detected by examining the arguments that are annotated by
+    `arg_q_descriptors`.
+
+    If quantization is detected, the base operator symbolic function will be wrapped with
+    argument de-quantization and output quantization.
+
+    Otherwise, only the base symbolic function will be invoked.
+ + For example: + + ``` + @quantized_args(True, False) + def foo(g, x, y): + return x + y + ``` + + is equivalent to + + ``` + def q_foo(g, x, y): + if is_quantized_tensor(x): + x = dequantize(x) + out = foo(g, x, y) + return quantize(out) + else: + return foo(g, x, y) + ``` + + Args: + arg_q_descriptors: A sequence of bool, where each element represents if the + argument is QTensor for quantized version of this operator. It defaults + to False for unspecified (variable length) arguments. + scale: Quantized output scale. If None, derive from + the first quantized input scale. + zero_point: Quantized output zero point. If None, + derive from the first quantized input zero point. + quantize_output: If True, quantize the output of the base operator. Default is True + """ + + def decorator(fn): + @functools.wraps(fn) + def wrapper(g, *args, **kwargs): + nonlocal scale + nonlocal zero_point + if scale is not None: + _scale = g.op("Constant", value_t=torch.tensor(scale)) + else: + _scale = None + if zero_point is not None: + _zero_point = g.op("Constant", value_t=torch.tensor(zero_point)) + else: + _zero_point = None + + # Support variable length arguments by marking unspecified ones as non-quantized + arg_q_descriptors_extended = arg_q_descriptors + (False,) * ( + len(args) - len(arg_q_descriptors) + ) + descriptor_args = tuple(zip(arg_q_descriptors_extended, args)) + + def _is_arg_quantized(descriptor, arg): + return descriptor and _is_value(arg) and _is_tuple_construct(arg) + + # Run regular symbolic function if none of the argument is QTensor. + is_quantized = [] + for descriptor, arg in descriptor_args: + # ListConstruct + if _is_packed_list(arg): + for arg_input in arg.node().inputs(): + is_quantized.append(_is_arg_quantized(descriptor, arg_input)) + else: + is_quantized.append(_is_arg_quantized(descriptor, arg)) + + if not any(is_quantized): + return fn(g, *args, **kwargs) + + # Dequantize arguments that are quantized + non_quantized_args = [] + for descriptor, arg in descriptor_args: + if _is_arg_quantized(descriptor, arg): + # Quantized arg is a tuple of (value, scale, zero_point) + dequantized_arg, arg_scale, arg_zero_point, _ = dequantize_helper( + g, arg + ) + non_quantized_args.append(dequantized_arg) + # Set scale and zero_point to the first quantized input if not already set + if _scale is None: + _scale = arg_scale + if _zero_point is None: + _zero_point = arg_zero_point + # ListConstruct + elif _is_packed_list(arg): + for arg_input in arg.node().inputs(): + if _is_arg_quantized(descriptor, arg_input): + # Quantized arg is a tuple of (value, scale, zero_point) + ( + dequantized_arg, + arg_scale, + arg_zero_point, + _, + ) = dequantize_helper(g, arg_input) + # Set scale and zero_point to the first quantized input if not already set + if _scale is None: + _scale = arg_scale + if _zero_point is None: + _zero_point = arg_zero_point + arg_input.replaceAllUsesWith(dequantized_arg) + non_quantized_args.append(arg) + else: + # Non-quantized arg + non_quantized_args.append(arg) + # TODO(justinchuby): Only single output is supported for now. We may want to + # support multiple outputs in the future. 
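+            # By this point every quantized argument detected above has been replaced
+            # by its dequantized float value, so the base fp32 symbolic runs unchanged.
+            # Its single output is re-quantized below with the first quantized input's
+            # scale/zero_point unless explicit values were passed to the decorator.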
+ output = fn(g, *non_quantized_args, **kwargs) + + assert _scale is not None, "Bug: Scale must be set for quantized operator" + assert ( + _zero_point is not None + ), "Bug: Zero point must be set for quantized operator" + + if quantize_output: + return quantize_helper(g, output, _scale, _zero_point) + return output + + return wrapper + + return decorator + + +def _scalar(x: Any) -> Number | None: + """Convert a scalar tensor into a Python value.""" + if isinstance(x, torch.Tensor) and x.shape == (): + return x.item() + return None + + +def _if_scalar_type_as(self, tensor): + """ + Convert self into the same type of tensor, as necessary. + We only support implicit casting for scalars, so we never + actually need to insert an ONNX cast operator here; just + fix up the scalar. + """ + if isinstance(self, _C.Value): + return self + + scalar_type = _type_utils.JitScalarType.from_value( + tensor, _type_utils.JitScalarType.UNDEFINED + ) + if scalar_type != _type_utils.JitScalarType.UNDEFINED: + ty = scalar_type.scalar_name().lower() + return getattr(self, ty)() + return self + + +def _is_none(x: Any) -> bool: + return x is None or (x.node().mustBeNone() if isinstance(x, _C.Value) else False) + + +def _is_value(x: Any) -> bool: + return isinstance(x, _C.Value) + + +def _is_constant(value: Any) -> bool: + return not _is_value(value) or value.node().kind() in { + "onnx::Constant", + "prim::Constant", + } + + +def _is_tensor(x: _C.Value) -> bool: + return x.type().isSubtypeOf(_C.TensorType.get()) + + +# Note: _C.JitType is not exposed to Python and cannot be checked in runtime. +def _as_list_type(jit_type: _C.JitType) -> _C.ListType | None: + if isinstance(jit_type, _C.ListType): + return jit_type + return None + + +def _is_list(x: _C.Value) -> bool: + return _as_list_type(x.type()) is not None + + +def _is_tensor_list(x: _C.Value) -> bool: + x_type = _as_list_type(x.type()) + if x_type is None: + return False + return isinstance(x_type.getElementType(), _C.TensorType) + + +def _is_scalar_list(x: _C.Value) -> bool: + """Checks if x is a scalar list, for example: List[float], List[int]. + + Besides checking the type is ListType, we also check if the data type is + a valid ONNX data type. + """ + x_type = _as_list_type(x.type()) + if x_type is None: + return False + scalar_type = _type_utils.JitScalarType.from_value(x) + return scalar_type.onnx_compatible() + + +def _is_tuple_construct(x: _C.Value) -> bool: + return x.node().kind() == "prim::TupleConstruct" + + +def is_complex_value(x: _C.Value) -> bool: + assert _is_value(x) + return _type_utils.JitScalarType.from_value( + x, _type_utils.JitScalarType.UNDEFINED + ) in { + _type_utils.JitScalarType.COMPLEX32, + _type_utils.JitScalarType.COMPLEX64, + _type_utils.JitScalarType.COMPLEX128, + } + + +def _get_tensor_rank(x: _C.Value) -> int | None: + if not _is_tensor(x) or x.type() is None: + return None + x_type = x.type() + x_type = typing.cast(_C.TensorType, x_type) + return x_type.dim() + + +def _get_tensor_sizes(x: _C.Value, allow_nonstatic: bool = True): + if not _is_tensor(x) or x.type() is None: + return None + x_type = x.type() + x_type = typing.cast(_C.TensorType, x_type) + if allow_nonstatic: + # Each individual symbol is returned as None. + # e.g. [1, "a", "b"] -> [1, None, None] + return x_type.varyingSizes() + # returns None, if exists any symbol in sizes. + # e.g. 
[1, "a", "b"] -> None + return x_type.sizes() + + +def _get_tensor_dim_size(x: _C.Value, dim: int) -> int | None: + sizes = _get_tensor_sizes(x) + return sizes[dim] if sizes else None + + +def _get_dim_for_cross(x: _C.Value, dim: int | None): + if dim == -1: + tensor_rank = _get_tensor_rank(x) + assert tensor_rank is not None + return dim + tensor_rank + # If dim is not given, it defaults to the first dimension found with the size 3 + if dim is None: + sizes = _get_tensor_sizes(x) + assert sizes is not None + for index, size in enumerate(sizes): + if size is not None and size == 3: + return index + return dim + + +def _unimplemented(op: str, msg: str, value: _C.Value | None = None) -> None: + # For BC reasons, the behavior for Caffe2 does not raise exception for unimplemented operators + if GLOBALS.operator_export_type == _C_onnx.OperatorExportTypes.ONNX: + _onnx_unsupported(f"{op}, {msg}", value) + + +def _onnx_unsupported(op_name: str, value: _C.Value | None = None) -> NoReturn: + message = ( + f"Unsupported: ONNX export of operator {op_name}. " + f"Please feel free to request support or submit a pull request " + f"on PyTorch GitHub: {_constants.PYTORCH_GITHUB_ISSUES_URL}" + ) + if isinstance(value, _C.Value): + raise errors.SymbolicValueError( + message, + value, + ) + raise errors.OnnxExporterError(message) + + +def _onnx_opset_unsupported( + op_name: str, + current_opset: int, + supported_opset: int, + value: _C.Value | None = None, +) -> NoReturn: + message = ( + f"Unsupported: ONNX export of {op_name} in opset {current_opset}. " + f"Please try opset version {supported_opset}." + ) + if isinstance(value, _C.Value): + raise errors.SymbolicValueError( + message, + value, + ) + raise errors.OnnxExporterError(message) + + +def _onnx_opset_unsupported_detailed( + op_name: str, + current_opset: int, + supported_opset: int, + reason: str, + value: _C.Value | None = None, +) -> NoReturn: + message = ( + f"Unsupported: ONNX export of {op_name} in " + f"opset {current_opset}. {reason}. Please try opset version {supported_opset}." + ) + if isinstance(value, _C.Value): + raise errors.SymbolicValueError( + message, + value, + ) + raise errors.OnnxExporterError(message) + + +def _block_list_in_opset(name: str): + def symbolic_fn(*args, **kwargs): + raise errors.OnnxExporterError( + f"ONNX export failed on {name}, which is not implemented for opset " + f"{GLOBALS.export_onnx_opset_version}. " + "Try exporting with other opset versions." 
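+            # NOTE: raised lazily, i.e. only if the exporter actually reaches the
+            # blocked op while exporting at the current opset.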
+ ) + + return symbolic_fn + + +def _try_get_scalar_type(*args) -> _type_utils.JitScalarType | None: + for arg in args: + scalar_type = _type_utils.JitScalarType.from_value( + arg, _type_utils.JitScalarType.UNDEFINED + ) + if scalar_type != _type_utils.JitScalarType.UNDEFINED: + return scalar_type + return None + + +def _type_promote_from_values(*args) -> _type_utils.JitScalarType: + undef = _type_utils.JitScalarType.UNDEFINED + jit_types = [_try_get_scalar_type(arg) for arg in args] + if len(jit_types) == 0: + return undef + if len(jit_types) == 1: + return jit_types[0] # type: ignore[return-value] + new_dtype = jit_types[0].dtype() # type: ignore[union-attr] + for t in jit_types: + new_dtype = torch.promote_types(new_dtype, t.dtype()) # type: ignore[union-attr] + return _type_utils.JitScalarType.from_dtype(new_dtype) + + +def _maybe_cast_to_type( + g: jit_utils.GraphContext, value, jit_type: _type_utils.JitScalarType +): + if ( + _type_utils.JitScalarType.from_value(value, _type_utils.JitScalarType.UNDEFINED) + != jit_type + ): + return g.op( + "Cast", + value, + to_i=jit_type.onnx_type(), + ) + return value + + +def _select_helper(g: jit_utils.GraphContext, self, dim, index, apply_reshape=True): + index_const = _maybe_get_scalar(index) + index_dim = _get_tensor_rank(index) + if not _is_value(index_const): + # Index is a constant scalar. Make it a size 1 constant tensor. + index = g.op("Constant", value_t=torch.LongTensor([index_const])) + elif index_dim is not None and apply_reshape: + if index_dim == 0: + # Index is a scalar. Reshape it to a size 1 tensor. + index = _reshape_helper( + g, index, g.op("Constant", value_t=torch.LongTensor([1])) + ) + + index_scalar_type = _type_utils.JitScalarType.from_value( + index, _type_utils.JitScalarType.UNDEFINED + ) + if index_scalar_type not in { + _type_utils.JitScalarType.INT64, + _type_utils.JitScalarType.INT, + }: + index = g.op("Cast", index, to_i=_C_onnx.TensorProtoDataType.INT64) + return g.op("Gather", self, index, axis_i=dim) + + +def _slice_helper( + g: jit_utils.GraphContext, + input, + axes, + starts, + ends, + steps=None, +): + if g.opset <= 9: + from torch.onnx.symbolic_opset9 import _slice as _slice9 + + return _slice9(g, input, axes, starts, ends) + else: + from torch.onnx.symbolic_opset10 import _slice as _slice10 + + return _slice10(g, input, axes, starts, ends, steps) + + +def _is_fp(value) -> bool: + return _type_utils.JitScalarType.from_value( + value, _type_utils.JitScalarType.UNDEFINED + ) in { + _type_utils.JitScalarType.FLOAT, + _type_utils.JitScalarType.DOUBLE, + _type_utils.JitScalarType.HALF, + _type_utils.JitScalarType.BFLOAT16, + } + + +def _is_bool(value) -> bool: + return _type_utils.JitScalarType.from_value( + value, _type_utils.JitScalarType.UNDEFINED + ) in {_type_utils.JitScalarType.BOOL} + + +def _generate_wrapped_number(g: jit_utils.GraphContext, scalar): + """Creates a wrapped number based on https://github.com/pytorch/pytorch/issues/9515. + + A Tensor is a considered a "wrapped number" if it is + auto-wrapped from a C++ or Python number type. Integer types are + wrapped as 0-dim int64 tensors and floating-point types are + wrapped as 0-dim double tensors. + + The input to this function is constant value. 
If the data type + is a floating point type, it is converted to a 0-dim double + tensor, else it is converted to a 0-dim tensor of its original type + """ + assert not isinstance(scalar, torch.Tensor) + if isinstance(scalar, float): + return g.op("Constant", value_t=torch.tensor(scalar, dtype=torch.double)) + return g.op("Constant", value_t=torch.tensor(scalar)) + + +def _sort_helper(g: jit_utils.GraphContext, input, dim, decending=True, out=None): + if out is not None: + _unimplemented("Sort", "Out parameter is not supported") + shape_ = g.op("Shape", input) + dim_size_ = g.op( + "Gather", + shape_, + g.op("Constant", value_t=torch.tensor([dim], dtype=torch.int64)), + ) + if g.opset <= 10: + if not decending: + _unimplemented("Sort", "Ascending is not supported") + return g.op("TopK", input, dim_size_, axis_i=dim, outputs=2) + else: + return g.op( + "TopK", input, dim_size_, axis_i=dim, largest_i=decending, outputs=2 + ) + + +def _topk_helper( + g: jit_utils.GraphContext, input, k, dim, largest=True, sorted=False, out=None +): + if out is not None: + _unimplemented("TopK", "Out parameter is not supported") + if not _is_value(k): + k = g.op("Constant", value_t=torch.tensor([k], dtype=torch.int64)) + else: + k = _reshape_helper(g, k, g.op("Constant", value_t=torch.tensor([1]))) + if _try_get_scalar_type(k) != _type_utils.JitScalarType.INT64: + k = g.op("Cast", k, to_i=_C_onnx.TensorProtoDataType.INT64) + if g.opset <= 10: + if not largest: + _unimplemented("TopK", "Ascending is not supported") + return g.op("TopK", input, k, axis_i=dim, outputs=2) + else: + return g.op( + "TopK", input, k, axis_i=dim, largest_i=largest, sorted_i=sorted, outputs=2 + ) + + +def _lt_helper(g: jit_utils.GraphContext, input, other): + if g.opset <= 8: + from torch.onnx.symbolic_opset8 import lt as _lt8 + + return _lt8(g, input, other) + else: + from torch.onnx.symbolic_opset9 import lt as _lt9 + + return _lt9(g, input, other) + + +def _interpolate_warning(interpolate_mode): + onnx_op = ( + "onnx:Resize" if GLOBALS.export_onnx_opset_version >= 10 else "onnx:Upsample" + ) + warnings.warn( + "You are trying to export the model with " + + onnx_op + + " for ONNX opset version " + "" + str(GLOBALS.export_onnx_opset_version) + ". " + "This operator might cause results to not match the expected results by PyTorch.\n" + "ONNX's Upsample/Resize operator did not match Pytorch's Interpolation until opset 11. " + "Attributes to determine how to transform the input were added in onnx:Resize in opset 11 " + "to support Pytorch's behavior (like coordinate_transformation_mode and nearest_mode).\n" + "We recommend using opset 11 and above for models using this operator." 
+ ) + + +def _unsqueeze_helper(g: jit_utils.GraphContext, input, axes_i): + if _is_constant(axes_i[0]): + if g.opset >= 13: + axes = g.op("Constant", value_t=torch.tensor(axes_i, dtype=torch.long)) + return g.op("Unsqueeze", input, axes) + return g.op("Unsqueeze", input, axes_i=axes_i) + # Tensor type + if g.opset < 13: + raise errors.SymbolicValueError( + "Opset version must be >= 13 for Unsqueeze with dynamic axes.", input + ) + return g.op("Unsqueeze", input, axes_i[0]) + + +def _squeeze_helper(g: jit_utils.GraphContext, input, axes_i): + if _is_constant(axes_i[0]): + if g.opset >= 13: + axes = g.op("Constant", value_t=torch.tensor(axes_i, dtype=torch.long)) + return g.op("Squeeze", input, axes) + return g.op("Squeeze", input, axes_i=axes_i) + # Tensor type + if g.opset < 13: + raise errors.SymbolicValueError( + "Opset version must be >= 13 for Squeeze with dynamic axes.", input + ) + axes_t = axes_i[0] + axes_rank = _get_tensor_rank(axes_t) + assert axes_rank is not None + if axes_rank > 1: + raise errors.SymbolicValueError( + "For Squeeze axses as input, the axes rank must be one in ONNX spec.", input + ) + elif axes_rank == 0: + # The axes is a scalar. Unsqueeze it to a rank 1 tensor. + axes_t = _unsqueeze_helper(g, axes_t, [0]) + return g.op("Squeeze", input, axes_t) + return g.op("Squeeze", input, axes_t) + + +def _reducesum_helper( + g: jit_utils.GraphContext, + input, + axes_i=None, + keepdims_i=1, + noop_with_empty_axes_i=0, +): + keepdims_i = _maybe_get_const(keepdims_i, "i") + if g.opset >= 13: + if axes_i: + if not _is_value(axes_i): + axes_i = g.op( + "Constant", value_t=torch.tensor(axes_i, dtype=torch.long) + ) + return g.op( + "ReduceSum", + input, + axes_i, + keepdims_i=keepdims_i, + noop_with_empty_axes_i=noop_with_empty_axes_i, + ) + return g.op( + "ReduceSum", + input, + keepdims_i=keepdims_i, + noop_with_empty_axes_i=noop_with_empty_axes_i, + ) + else: + return g.op("ReduceSum", input, axes_i=axes_i, keepdims_i=keepdims_i) + + +def _interpolate_size_to_scales(g: jit_utils.GraphContext, input, output_size, dim): + output_size = _maybe_get_const(output_size, "is") + if _is_value(output_size): + offset = 2 + offsets = g.op("Constant", value_t=torch.ones(offset, dtype=torch.float32)) + dividend = g.op("Cast", output_size, to_i=_C_onnx.TensorProtoDataType.FLOAT) + divisor = _slice_helper( + g, g.op("Shape", input), axes=[0], ends=[sys.maxsize], starts=[offset] + ) + divisor = g.op("Cast", divisor, to_i=_C_onnx.TensorProtoDataType.FLOAT) + scale_dims = g.op("Div", dividend, divisor) + scales = g.op("Concat", offsets, scale_dims, axis_i=0) + else: + scales_constant = [ + 1.0 + if i < 2 + else float(output_size[-(dim - i)]) + / float(input.type().sizes()[-(dim - i)]) + for i in range(0, dim) + ] + scales = g.op( + "Constant", value_t=torch.tensor(scales_constant, dtype=torch.float32) + ) + return scales + + +def _interpolate_get_scales_if_available(g: jit_utils.GraphContext, scales): + available_scales = _maybe_get_const(scales[0], "fs") != -1 and not _is_none( + scales[0] + ) + + if not available_scales: + return None + + offsets = g.op("Constant", value_t=torch.ones(2, dtype=torch.float32)) + scales_list = g.op( + "Constant", value_t=torch.tensor(_maybe_get_const(scales[0], "fs")) + ) + scales = g.op("Concat", offsets, scales_list, axis_i=0) + return scales + + +def _get_interpolate_attributes(g: jit_utils.GraphContext, mode, args): + if mode == "nearest": + align_corners = None + scales = args[0:] + else: + align_corners = args[0] + scales = args[1:] + scales = 
_interpolate_get_scales_if_available(g, scales) + return scales, align_corners + + +def _interpolate_get_scales(g: jit_utils.GraphContext, scale_factor, dim): + offsets = g.op("Constant", value_t=torch.ones(2, dtype=torch.float32)) + scale_factor_rank = _get_tensor_rank(scale_factor) + if isinstance(scale_factor.type(), _C.ListType) or ( + scale_factor_rank is not None and scale_factor_rank > 0 + ): + return g.op("Concat", offsets, scale_factor, axis_i=0) + else: + scale_factor = _unsqueeze_helper(g, scale_factor, [0]) + scale_factor = g.op( + "Cast", scale_factor, to_i=_C_onnx.TensorProtoDataType.FLOAT + ) + scales = [scale_factor for i in range(dim - 2)] + scale_factor = g.op("Concat", offsets, *scales, axis_i=0) + return scale_factor + + +def _interpolate_get_scales_and_mode( + g: jit_utils.GraphContext, input, size, scale_factor, mode, align_corners +): + mode = _maybe_get_const(mode, "s") + if "linear" in mode: + mode = "linear" + if "cubic" in mode: + mode = "cubic" + _interpolate_warning(mode) + + align_corners = _maybe_get_const(align_corners, "b") + if isinstance(align_corners, bool) and align_corners: + return _unimplemented("interpolate", "align_corners == True") + + if not input.type().dim(): + return _unimplemented("interpolate", "missing input shape") + dim = input.type().dim() + + if not _is_none(scale_factor): + scale_factor = _interpolate_get_scales(g, scale_factor, dim) + elif not _is_none(size): + if not _is_packed_list(size): + is_scalar = _maybe_get_const(size, "t").dim() == 0 + if is_scalar: + size = _unsqueeze_helper(g, size, [0]) + size = [size for i in range(dim - 2)] + size = g.op("Concat", *size, axis_i=0) + scale_factor = _interpolate_size_to_scales(g, input, size, dim) + else: + return _unimplemented( + "interpolate", "Both size and scales are None in __interpolate" + ) + return scale_factor, mode + + +def _argmin_argmax_helper( + g: jit_utils.GraphContext, + input: torch._C.Value, + dim: torch._C.Value, + keepdim: bool, + op_name: str, +): + def op_wrapper(input, axis_i, keepdims_i): + if g.opset >= 12: + return g.op( + op_name, + input, + axis_i=axis_i, + keepdims_i=keepdims_i, + select_last_index_i=False, + ) + return g.op(op_name, input, axis_i=axis_i, keepdims_i=keepdims_i) + + if _is_none(dim): + flattened = _reshape_helper( + g, input, g.op("Constant", value_t=torch.tensor([-1])) + ) + output = op_wrapper(flattened, axis_i=0, keepdims_i=False) + if keepdim: + input_shape = g.op("Shape", input) + input_shape_shape = g.op("Shape", input_shape) + new_shape = g.op( + "ConstantOfShape", + input_shape_shape, + value_t=torch.tensor([1], dtype=torch.int64), + ) + output = g.op("Reshape", output, new_shape) + return output + + dim = _parse_arg(dim, "i") + return op_wrapper(input, axis_i=dim, keepdims_i=keepdim) + + +def _interpolate_helper(name, dim, interpolate_mode): + @quantized_args(True, False, False) + def symbolic_fn(g, input, output_size, *args): + scales, align_corners = _get_interpolate_attributes(g, interpolate_mode, args) + align_corners = _maybe_get_scalar(align_corners) + coordinate_transformation_mode = ( + "asymmetric" + if interpolate_mode == "nearest" + else "align_corners" + if align_corners + else "half_pixel" + ) + + if scales is None: + input_size = g.op("Shape", input) + input_size_beg = _slice_helper( + g, input_size, axes=[0], ends=[2], starts=[0] + ) + output_size = g.op( + "Cast", output_size, to_i=_C_onnx.TensorProtoDataType.INT64 + ) + output_size = g.op("Concat", input_size_beg, output_size, axis_i=0) + + if g.opset >= 13: + 
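+                # Opset >= 13 can express genuinely absent optional Resize inputs
+                # (roi/scales) via a placeholder; older opsets pass empty tensors.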
empty_roi = _optional_input_placeholder_tensor(g) + empty_scales = _optional_input_placeholder_tensor(g) + else: + empty_roi = g.op( + "Constant", value_t=torch.tensor([], dtype=torch.float32) + ) + empty_scales = g.op( + "Constant", value_t=torch.tensor([], dtype=torch.float32) + ) + + return g.op( + "Resize", + input, + empty_roi, + empty_scales, + output_size, + coordinate_transformation_mode_s=coordinate_transformation_mode, + cubic_coeff_a_f=-0.75, # only valid when mode="cubic" + mode_s=interpolate_mode, # nearest, linear, or cubic + nearest_mode_s="floor", + ) # only valid when mode="nearest" + else: + if g.opset >= 13: + empty_roi = _optional_input_placeholder_tensor(g) + else: + empty_roi = g.op( + "Constant", value_t=torch.tensor([], dtype=torch.float32) + ) + + return g.op( + "Resize", + input, + empty_roi, + scales, + coordinate_transformation_mode_s=coordinate_transformation_mode, + cubic_coeff_a_f=-0.75, # only valid when mode="cubic" + mode_s=interpolate_mode, # nearest, linear, or cubic + nearest_mode_s="floor", + ) # only valid when mode="nearest" + + return symbolic_fn + + +def __interpolate_helper( + g: jit_utils.GraphContext, + input, + size, + scale_factor, + mode, + align_corners, + recompute_scale_factor, +): + mode = _maybe_get_const(mode, "s") + if "linear" in mode: + mode = "linear" + if "cubic" in mode: + mode = "cubic" + align_corners = _maybe_get_const(align_corners, "b") + align_corners = False if not isinstance(align_corners, bool) else align_corners + coordinate_transformation_mode = ( + "asymmetric" + if mode == "nearest" + else "align_corners" + if align_corners + else "half_pixel" + ) + + if not _is_none(size): + input_size = g.op("Shape", input) + input_size = _slice_helper(g, input_size, axes=[0], ends=[2], starts=[0]) + # in some cases size is not a packed list but size is a scalar + # We need to also verify that (_maybe_get_const(size, "t").dim() == 0) + # but this information is not always available. Try to get the dim, + # and if not assume that it is not a scalar. + try: + is_scalar = not _is_packed_list(size) and ( + _maybe_get_const(size, "t").dim() == 0 + ) + except AttributeError: + is_scalar = not _is_packed_list(size) + if not is_scalar: + warnings.warn( + "Cannot verify if the output_size is a scalar " + "while exporting interpolate. Assuming that it is not a scalar." 
+ ) + + if is_scalar: + rank = _get_tensor_rank(input) + if rank is None: + return _unimplemented( + "interpolate (with a scalar output_size)", + "missing input shape (try giving an array of output_size values)", + ) + size = _unsqueeze_helper(g, size, [0]) + size = [size for i in range(rank - 2)] + size = g.op("Concat", *size, axis_i=0) + size = g.op("Cast", size, to_i=_C_onnx.TensorProtoDataType.INT64) + size = g.op("Concat", input_size, size, axis_i=0) + + if g.opset >= 13: + empty_roi = _optional_input_placeholder_tensor(g) + empty_scales = _optional_input_placeholder_tensor(g) + else: + empty_roi = g.op("Constant", value_t=torch.tensor([], dtype=torch.float32)) + empty_scales = g.op( + "Constant", value_t=torch.tensor([], dtype=torch.float32) + ) + + return g.op( + "Resize", + input, + empty_roi, + empty_scales, + size, + coordinate_transformation_mode_s=coordinate_transformation_mode, + cubic_coeff_a_f=-0.75, # only valid when mode="cubic" + mode_s=mode, # nearest, linear, or cubic + nearest_mode_s="floor", + ) + else: # if not _is_none(scales) + rank = _get_tensor_rank(input) + if rank is None: + return _unimplemented("interpolate (with scales)", "missing input shape") + + if g.opset >= 13: + empty_roi = _optional_input_placeholder_tensor(g) + else: + empty_roi = g.op("Constant", value_t=torch.tensor([], dtype=torch.float32)) + + scales = _interpolate_get_scales(g, scale_factor, rank) + return g.op( + "Resize", + input, + empty_roi, + scales, + coordinate_transformation_mode_s=coordinate_transformation_mode, + cubic_coeff_a_f=-0.75, # only valid when mode="cubic" + mode_s=mode, # nearest, linear, or cubic + nearest_mode_s="floor", + ) # only valid when mode="nearest" + + +def _unbind_helper(g: jit_utils.GraphContext, self, dim, _outputs): + if g.opset < 11: + from torch.onnx.symbolic_opset9 import unbind + elif g.opset <= 12: + from torch.onnx.symbolic_opset11 import unbind # type: ignore[no-redef] + else: + from torch.onnx.symbolic_opset13 import unbind # type: ignore[no-redef] + return unbind(g, self, dim, _outputs) + + +def _scatter_helper(g: jit_utils.GraphContext, self, dim, index, src): + if g.opset <= 10: + from torch.onnx.symbolic_opset9 import scatter + else: + # for mypy, scatter was imported two lines above + from torch.onnx.symbolic_opset11 import scatter # type: ignore[no-redef] + return scatter(g, self, dim, index, src) + + +def _repeat_interleave_split_helper(g: jit_utils.GraphContext, self, reps, dim): + if g.opset <= 12: + split_out = g.op("Split", self, split_i=[1] * reps, axis_i=dim, outputs=reps) + else: + from torch.onnx.symbolic_opset13 import split + + repeats = g.op("Constant", value_t=torch.tensor([1] * reps)) + split_out = split(g, self, repeats, dim, _outputs=reps) + return split_out if reps > 1 else [split_out] + + +def _repeat_interleave_single_value_repeat_helper( + g: jit_utils.GraphContext, self, repeats, dim +): + from torch.onnx.symbolic_opset9 import flatten, unsqueeze + + if not _is_tensor(repeats): + repeats = g.op("Constant", value_t=torch.LongTensor(repeats)) + + const_repeats: bool = _is_constant(repeats) + reps = _maybe_get_const(repeats, "t") + + # Convert 'repeats' to 1-d if it is 0-d. + if _get_tensor_rank(repeats) == 0: + repeats = g.op("Reshape", repeats, g.op("Constant", value_t=torch.tensor([1]))) + + # Create a new dim of size 1, then expand it to be 'repeats' long, and finally collapse it. 
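+    # Illustrative example: for `self` of shape (2, 3) with dim=1 and
+    # repeats=2, we unsqueeze to (2, 3, 1), tile to (2, 3, 2), and flatten
+    # dims 1..2 back into (2, 6).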
+ unsqueezed = unsqueeze(g, self, dim + 1) + + # repeats_per_dim is 1 for all dims except for the new unsqueezed dim, where it has value 'repeats'. + if const_repeats: + # 'Repeats' is a constant, 'repeats_per_dim' can be a constant. + onehot = torch.ones(_get_tensor_rank(unsqueezed), dtype=torch.int64) # type: ignore[arg-type] + onehot[dim + 1] = reps + repeats_per_dim = g.op("Constant", value_t=onehot) + else: + # 'Repeats' is a variable, 'repeats_per_dim' cannot be a constant. + onehot = g.op( + "OneHot", + unsqueeze(g, dim + 1, 0), # indices, must be >= 1-dimensional + g.op( + "Constant", value_t=torch.tensor(_get_tensor_rank(unsqueezed)) + ), # depth + g.op( + "Concat", g.op("Constant", value_t=torch.tensor([1])), repeats, axis_i=0 + ), # on/off values + ) + repeats_per_dim = flatten(g, onehot, 0, 1) + + tiled = g.op("Tile", unsqueezed, repeats_per_dim) + return flatten(g, tiled, dim, dim + 1) + + +def _arange_cast_helper( + g: jit_utils.GraphContext, end, start=None, step=None, dtype=None +) -> tuple[ + _type_utils.JitScalarType, + _C.Value | None, + _C.Value | None, + _C.Value | None, +]: + def _is_all_integral(scalars): + for scalar in scalars: + scalar_type = _type_utils.JitScalarType.from_value( + scalar, _type_utils.JitScalarType.UNDEFINED + ) + if ( + scalar_type != _type_utils.JitScalarType.INT64 + and scalar_type != _type_utils.JitScalarType.UNDEFINED + ): + return False + return True + + # This logic is based on torch.arange docs. If "dtype" is provided, + # infer input types from dtype. If not, then check if any of start, stop, + # or step are floating point, and infer the type from get_default. + # Otherwise, the dtype is inferred to be torch.int64. + if dtype is None or (_is_value(dtype) and _is_none(dtype)): + if _is_all_integral([start, end, step]): + scalar_type = _type_utils.JitScalarType.INT64 + else: + scalar_type = _type_utils.JitScalarType.from_dtype( + torch.get_default_dtype() + ) + else: + assert isinstance(dtype, int) + # TODO(justinchuby): Check if dtype is indeed a int. + scalar_type = _type_utils.JitScalarType(dtype) + + start = g.op("Cast", start, to_i=scalar_type.onnx_type()) if start else None + end = g.op("Cast", end, to_i=scalar_type.onnx_type()) if end else None + step = g.op("Cast", step, to_i=scalar_type.onnx_type()) if step else None + return scalar_type, end, start, step + + +def _arange_helper(g: jit_utils.GraphContext, *args): + if g.opset <= 10: + from torch.onnx.symbolic_opset9 import arange + else: + from torch.onnx.symbolic_opset11 import arange # type: ignore[no-redef] + return arange(g, *args) + + +def _size_helper(g: jit_utils.GraphContext, self, dim): + full_shape = g.op("Shape", self) + from torch.onnx.symbolic_opset9 import select + + return select(g, full_shape, g.op("Constant", value_t=torch.tensor([0])), dim) + + +def _index_fill_reshape_helper(g: jit_utils.GraphContext, self, dim, index): + # 1. reshape index => [1, ..., 1, dim, 1, ..., 1] + # 2. expand index => [..., dim, ...], same shape as self except for dim. + # 3. expand value as well. + # 4. apply onnx::scatter. 
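+    # Illustrative example: for self of shape (3, 4), dim=1 and index of
+    # shape (2,), index is unsqueezed to (1, 2) and expanded to (3, 2),
+    # matching self in every dim except dim 1.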
+ + from torch.onnx.symbolic_opset9 import expand + + if g.opset <= 10: + from torch.onnx.symbolic_opset9 import scatter + else: + # for mypy, scatter was imported two lines above + from torch.onnx.symbolic_opset11 import scatter # type: ignore[no-redef] + + if self.type().dim() is None: + return _unimplemented("index_fill", "input rank not accessible") + self_dim = self.type().dim() + dim_value = _parse_arg(dim, "i") + if dim_value < 0: + dim_value += self_dim + unsqueezed_index = _unsqueeze_helper( + g, index, [i for i in range(self_dim) if i != dim_value] + ) + expanded_index_shape = scatter( + g, g.op("Shape", self), 0, _unsqueeze_helper(g, dim, [0]), g.op("Shape", index) + ) + expanded_index = expand(g, unsqueezed_index, expanded_index_shape, None) + return expanded_index_shape, expanded_index + + +# By default, when any value in the 'shape' input is equal to zero +# the corresponding dimension value is copied from the input tensor dynamically. +# allowzero=1 indicates that if any value in the 'shape' input is set to zero, +# the zero value is honored, similar to NumPy. +# allowzero=1 is only supported for opset version >= 14. +def _reshape_helper(g: jit_utils.GraphContext, input, shape, allowzero=0): + shape = _maybe_get_const(shape, "is") + if not _is_value(shape): + shape = g.op("Constant", value_t=torch.LongTensor(shape)) + if g.opset <= 13: + if allowzero == 1: + _onnx_opset_unsupported( + "Reshape with allowzero=1", GLOBALS.export_onnx_opset_version, 14, input + ) + return g.op("Reshape", input, shape) + else: + return g.op("Reshape", input, shape, allowzero_i=allowzero) + + +def _batchnorm_helper( + g: jit_utils.GraphContext, input, weight, bias, running_mean, running_var +): + from torch.onnx.symbolic_opset9 import _var_mean + + batch_size = _get_tensor_dim_size(input, 0) + channel_size = _get_tensor_dim_size(input, 1) + + if weight is None or _is_none(weight): + if channel_size is None: + raise errors.SymbolicValueError( + "Unsupported: ONNX export of batch_norm for unknown channel size.", + input, + ) + weight_value = torch.tensor( + [1.0] * channel_size, + dtype=_type_utils.JitScalarType.from_value(input).dtype(), + ) + weight = g.op("Constant", value_t=weight_value) + if bias is None or _is_none(bias): + if channel_size is None: + raise errors.SymbolicValueError( + "Unsupported: ONNX export of batch_norm for unknown channel size.", + input, + ) + bias_value = torch.tensor( + [0.0] * channel_size, + dtype=_type_utils.JitScalarType.from_value(input).dtype(), + ) + bias = g.op("Constant", value_t=bias_value) + # If track_running_stats is set to False batch statistics are instead used during evaluation time + if ( + running_mean is None + or _is_none(running_mean) + or running_var is None + or _is_none(running_var) + ): + assert batch_size is not None and channel_size is not None + reshape_in = _reshape_helper( + g, + input, + g.op( + "Constant", + value_t=torch.tensor([batch_size, channel_size, -1], dtype=torch.int64), + ), + ) + trans_in = g.op("Transpose", reshape_in, perm_i=[0, 2, 1]) + running_var, running_mean = _var_mean( + g, + trans_in, + g.op("Constant", value_t=torch.tensor([0, 1], dtype=torch.int64)), + False, + False, + ) + return weight, bias, running_mean, running_var + + +def _avgpool_helper( + tuple_fn: Callable[[Any], Sequence[int]], + padding: int | Sequence[int], + kernel_size, + stride, + divisor_override, + name, +) -> tuple[int, ...]: + if divisor_override and divisor_override.node().kind() != "prim::Constant": + _unimplemented(name, "divisor_override") 
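+    # tuple_fn is expected to normalize an int or shorter sequence into one
+    # padding value per spatial dim (e.g. 1 -> (1, 1) for 2-D pooling).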
+    return tuple(tuple_fn(padding))
+
+
+def check_training_mode(op_train_mode: int, op_name: str) -> None:
+    """Warns the user if the model's training mode and the export mode do not agree."""
+    if GLOBALS.training_mode == _C_onnx.TrainingMode.PRESERVE:
+        return
+
+    if op_train_mode:
+        op_mode_enum = _C_onnx.TrainingMode.TRAINING
+    else:
+        op_mode_enum = _C_onnx.TrainingMode.EVAL
+    if op_mode_enum == GLOBALS.training_mode:
+        # The modes agree. Do nothing.
+        return
+
+    op_mode_text = f"train={bool(op_train_mode)}"
+    # Setting the model mode could result in op_mode != GLOBALS.training_mode
+    # if the model is a FuncModule. In this case we warn the user about the
+    # mismatch and export according to op_mode. This supports use cases where
+    # certain layer weights are kept fixed during training.
+    warnings.warn(
+        f"ONNX export mode is set to {GLOBALS.training_mode}, but operator '{op_name}' "
+        f"is set to {op_mode_text}. Exporting with {op_mode_text}."
+    )
+
+
+def _flatten_helper(g: jit_utils.GraphContext, input, start_dim, end_dim, dim):
+    input_size = g.op("Shape", input)
+    slice1 = _slice_helper(g, input_size, axes=[0], starts=[0], ends=[start_dim])
+    slices = [slice1, g.op("Constant", value_t=torch.tensor([-1], dtype=torch.long))]
+    if end_dim < dim - 1:
+        slice3 = _slice_helper(
+            g, input_size, axes=[0], starts=[end_dim + 1], ends=[dim]
+        )
+        slices = [
+            slice1,
+            g.op("Constant", value_t=torch.tensor([-1], dtype=torch.long)),
+            slice3,
+        ]
+
+    final_shape = g.op("Concat", *slices, axis_i=0)
+    from torch.onnx.symbolic_opset9 import _reshape_from_tensor
+
+    return _reshape_from_tensor(g, input, final_shape)
+
+
+def _is_split_static(split_size_or_sizes, _outputs):
+    if _outputs is None:
+        return False
+    if (
+        _is_value(split_size_or_sizes)
+        and split_size_or_sizes.node().kind() != "onnx::Constant"
+    ):
+        return False
+    return True
+
+
+def _optional_input_placeholder_tensor(g):
+    n = g.op("prim::Constant")
+    n.setType(_C.OptionalType.ofTensor())
+    return n
+
+
+def _handle_reduce_dim_none(g: jit_utils.GraphContext, self, op_name):
+    rank = _get_tensor_rank(self)
+    if rank is not None and any(
+        _get_tensor_dim_size(self, i) == 0 for i in range(rank)
+    ):
+        # If the input tensor is empty, set keepdims=1, per the ONNX ReduceSum
+        # definition, so that the resulting tensor has the same rank as the input.
+        return g.op(op_name, self, keepdims_i=1)
+    return g.op(op_name, self, keepdims_i=0)
+
+
+def dequantize_helper(
+    g: jit_utils.GraphContext,
+    qtensor: _C.Value,
+    qdtype: _C_onnx.TensorProtoDataType | None = None,
+) -> tuple[_C.Value, _C.Value, _C.Value, _C.Value | None]:
+    """Appends to graph `g` ONNX nodes that dequantize `qtensor` into `tensor`.
+
+    Args:
+        g: Graph, the ONNX IR graph that is under construction.
+        qtensor: torch._C.Value, either a tuple of (quantized_tensor, scale, zero_point)
+            for per tensor quantization, or
+            (quantized_tensor, scale, zero_point, axis) for per channel quantization,
+            representing the quantized tensor.
+        qdtype: torch.onnx.TensorProtoDataType, default None. If not None, represents
+            the data type of the quantized tensor. It must be either
+            torch.onnx.TensorProtoDataType.UINT8 or torch.onnx.TensorProtoDataType.INT8.
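+
+    Example (an illustrative sketch; ``g`` and ``q_input`` stand in for a graph
+    context and an upstream quantized value and are hypothetical names here)::
+
+        tensor, scale, zero_point, axis = dequantize_helper(g, q_input)
+        # `tensor` is the output of the emitted DequantizeLinear node;
+        # `axis` is None for per tensor quantization.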
+ """ + unpacked_qtensors = _unpack_quantized_tensor(qtensor) + tensor, scale, zero_point = unpacked_qtensors[:3] + axis = unpacked_qtensors[3] if len(unpacked_qtensors) >= 4 else None + axis_i = _get_const(axis, "i", "axis") + input_qdtype = _type_utils.JitScalarType.from_value(tensor) + if qdtype is None: + if input_qdtype is not None: + qdtype = input_qdtype.onnx_type() + else: + qdtype = _C_onnx.TensorProtoDataType.UINT8 + value = g.op("Cast", tensor, to_i=qdtype) + scale = g.op("Cast", scale, to_i=_C_onnx.TensorProtoDataType.FLOAT) + zero_point = g.op("Cast", zero_point, to_i=qdtype) + + if axis_i is not None and GLOBALS.export_onnx_opset_version < 13: + _onnx_opset_unsupported_detailed( + "DequantizeLinear", + GLOBALS.export_onnx_opset_version, + 13, + "Attribute axis is not supported.", + qtensor, + ) + + return ( + g.op("DequantizeLinear", value, scale, zero_point, axis_i=axis_i), + scale, + zero_point, + axis, + ) + + +def quantize_helper( + g: jit_utils.GraphContext, + tensor: _C.Value, + scale: _C.Value, + zero_point: _C.Value, + axis: _C.Value | None = None, +) -> _C.Value: + """Appends to graph `g` ONNX nodes that quantizes `tensor` based on `scale`, `zero_point` and `axis`. + + Args: + g: Graph, the ONNX IR graph that is under construction. + tensor: torch._C.Value, representing the tensor to be quantized. + scale: torch._C.Value, quantized scale. + zero_point: torch._C.Value, quantized zero point. + axis: Optional[torch._C.Value] default None, if None, represents per tensor quantization. + Otherwise, represents per channel quantization, along given axis. + + Returns: + A TupleConstruct storing information of the quantized tensor. + """ + if ( + axis is not None + and not _is_none(axis) + and GLOBALS.export_onnx_opset_version < 13 + ): + _onnx_opset_unsupported_detailed( + "QuantizeLinear", + GLOBALS.export_onnx_opset_version, + 13, + "Attribute axis is not supported.", + tensor, + ) + + assert scale is not None + if ( + _type_utils.JitScalarType.from_value(scale, _type_utils.JitScalarType.UNDEFINED) + != _type_utils.JitScalarType.FLOAT + ): + scale = g.op("Cast", scale, to_i=_C_onnx.TensorProtoDataType.FLOAT) + + assert zero_point is not None + if _type_utils.JitScalarType.from_value( + zero_point, _type_utils.JitScalarType.UNDEFINED + ) not in { + _type_utils.JitScalarType.UINT8, + _type_utils.JitScalarType.INT8, + }: + zero_point = g.op("Cast", zero_point, to_i=_C_onnx.TensorProtoDataType.UINT8) + output = g.op( + "QuantizeLinear", + tensor, + scale, + zero_point, + axis_i=_get_const(axis, "i", "axis"), + ) + args = [output, scale, zero_point] + if axis is not None and not _is_none(axis): + args.append(axis) + return g.op("prim::TupleConstruct", *args) + + +def requantize_bias_helper( + g: jit_utils.GraphContext, bias, input_scale, weight_scale, axis=None +): + """In PyTorch, bias is float and is quantized to int32 implicitly inside the quantized ATen op kernel. + In ONNX we need to make the quantization explicit because operators expect all of their inputs to be quantized. + Since int32 is not a supported output type by ONNX operator `QuantizeLinear`, quantization is exported using + regular operators. 
+ """ + bias_scale = g.op("Mul", weight_scale, input_scale) + bias_scale_shape = g.op("Shape", bias_scale) + bias_zero_point = g.op( + "ConstantOfShape", bias_scale_shape, value_t=torch.tensor([0], dtype=torch.int) + ) + q_bias = g.op( + "Cast", g.op("Div", bias, bias_scale), to_i=_C_onnx.TensorProtoDataType.INT32 + ) + axis_args = [] + if axis is not None and not _is_none(axis): + axis_args.append(axis) + return g.op("prim::TupleConstruct", q_bias, bias_scale, bias_zero_point, *axis_args) + + +def args_have_same_dtype(args): + assert args + base_dtype = _type_utils.JitScalarType.from_value(args[0]) + has_same_dtype = all( + _type_utils.JitScalarType.from_value(elem) == base_dtype for elem in args + ) + return has_same_dtype + + +def _op_with_optional_float_cast(g: jit_utils.GraphContext, op_name, *args, **kwargs): + """Some PyTorch operators (e.g., Clip/Min/ReLU/Pad) are super set of ONNX in terms of data types. + This function maximizes the exportability of PyTorch-ONNX by allowing ONNX-unsupported PyTorch + operator data type. For example, `Cast(Clip(Cast(INPUT)))` can be used to mimic + `Clip(INPUT)` (opset version < 12). + + Args: + g (torch._C.Graph): graph to write the ONNX representation into. + op_name (str): operator name in ONNX. + *args (tuple): operands to the operator. + **kwargs (dict): attributes to the operator along with "opset_before" (optional, None by default) + indicating the smallest opset version to trigger such casting behavior and "target_float_t" + (optional, torch.onnx.JitScalarType.FLOAT by default) indicating the data type of internal operator. + + Returns: + Optional[torch._C.Value, Tuple[torch._C.Value, ...]]: output(s) of the operator. + """ + opset_before = kwargs.pop("opset_before", None) + target_float_t = kwargs.pop("target_float_t", _type_utils.JitScalarType.FLOAT) + + inputs = list(args) + dtype_0 = _type_utils.JitScalarType.from_value(inputs[0]) + + require_cast = not _is_fp(inputs[0]) and ( + opset_before is None or GLOBALS.export_onnx_opset_version < opset_before + ) + + if require_cast: + for input in inputs: + if input.isCompleteTensor(): + input_scalar_type = _type_utils.JitScalarType.from_value(input) + if input_scalar_type != dtype_0: + raise errors.SymbolicValueError( + f"Inputs of {op_name} must have same dtype." 
+ f"Got {dtype_0.scalar_name()} and {input_scalar_type.scalar_name()}", + input, + ) + for i, input in enumerate(inputs): + if input.isCompleteTensor() and not _is_fp(input): + inputs[i] = g.op( + "Cast", + input, + to_i=target_float_t.onnx_type(), + ) + + self = g.op(op_name, *inputs, **kwargs) + + if require_cast: + self = g.op("Cast", self, to_i=dtype_0.onnx_type()) + + return self + + +def _maybe_cast_reduce_op_input(g: jit_utils.GraphContext, self): + scalar_type = _type_utils.JitScalarType.from_value( + self, _type_utils.JitScalarType.UNDEFINED + ) + if scalar_type != _type_utils.JitScalarType.UNDEFINED: + # This check only covers traced modules where dtype is present + # pytorch reduce-ops cast all other integral types to int64 + if not _is_fp(self) and scalar_type != _type_utils.JitScalarType.INT64: + self = g.op("Cast", self, to_i=_C_onnx.TensorProtoDataType.INT64) + return self + + +def _apply_params(*args, **kwargs): + """Returns a decorator that calls the decorated (higher-order) function with the given parameters.""" + + def _apply(fn): + return fn(*args, **kwargs) + + return _apply + + +def _reduce_op_symbolic_helper(onnx_op_name, allow_multi_dim_support=True): + def symbolic(g, self, dim=None, keepdim=None): + self = _maybe_cast_reduce_op_input(g, self) + if dim is None or dim == (): + # Dim can be 0, which will cause (not dim) == True. So we don't want to do + # (not dim) + # all-reduce path + return _handle_reduce_dim_none(g, self, onnx_op_name) + else: + # dim-reduce path + keepdim = _get_const(keepdim, "i", "keepdim") + if g.opset < 18: + desc = "is" if allow_multi_dim_support else "i" + dim = _get_const(dim, desc, "dim") + dim_list = dim if allow_multi_dim_support else [dim] + return g.op(onnx_op_name, self, axes_i=dim_list, keepdims_i=keepdim) + else: + if _is_value(dim): + axes = dim + else: + if allow_multi_dim_support: + axes = g.op( + "Constant", value_t=torch.tensor(dim, dtype=torch.long) + ) + else: + axes = g.op( + "Constant", value_t=torch.tensor([dim], dtype=torch.long) + ) + return g.op(onnx_op_name, self, axes, keepdims_i=keepdim) + + return symbolic + + +def _overload_by_arg_count(fn): + @functools.wraps(fn) + def wrapper(g, *args): + overloads = fn(g, *args) + for overload in overloads: + arg_descriptors = overload._arg_descriptors + if len(arg_descriptors) == len(args): + return overload(g, *args) + return _unimplemented(f"aten::{fn.__name__}", f"with {len(args)} arguments") + + return wrapper + + +def _reduce_with_dtype_helper( + onnx_op: str, name: str, allow_multi_dim_support: bool = True +): + symbolic = _reduce_op_symbolic_helper( + onnx_op, allow_multi_dim_support=allow_multi_dim_support + ) + + @_overload_by_arg_count + def reduce(g, *args, **kwargs): + @quantized_args(True) + @parse_args("v", "none") + def reduce_nodim(g, self, dtype): + dtype_onnx = None + if dtype.node().kind() == "onnx::Constant": + dtype = _get_const(dtype, "i", "dtype") + dtype_onnx = _type_utils.JitScalarType(dtype).onnx_type() + self = g.op("Cast", self, to_i=dtype_onnx) + elif dtype.node().kind() != "prim::Constant": + return _unimplemented(name, "dtype", dtype) + result = symbolic(g, self) + if dtype_onnx is not None: + result_dtype_onnx = _type_utils.JitScalarType.from_value( + result + ).onnx_type() + if result_dtype_onnx != dtype_onnx: + result = g.op("Cast", result, to_i=dtype_onnx) + return result + + dim_desc = "is" if allow_multi_dim_support else "i" + + @quantized_args(True) + @parse_args("v", dim_desc, "i", "none") # type: ignore[arg-type] + def reduce_dim(g, 
self, dim, keepdim, dtype):
+            dtype_onnx = None
+            if dtype.node().kind() == "onnx::Constant":
+                dtype = _get_const(dtype, "i", "dtype")
+                dtype_onnx = _type_utils.JitScalarType(dtype).onnx_type()
+                self = g.op("Cast", self, to_i=dtype_onnx)
+            elif dtype.node().kind() != "prim::Constant":
+                return _unimplemented(name, "dtype", dtype)
+            result = symbolic(g, self, dim, keepdim)
+            if dtype_onnx is not None:
+                result_dtype_onnx = _type_utils.JitScalarType.from_value(
+                    result
+                ).onnx_type()
+                if result_dtype_onnx != dtype_onnx:
+                    result = g.op("Cast", result, to_i=dtype_onnx)
+            return result
+
+        return reduce_nodim, reduce_dim
+
+    return reduce
+
+
+def _max_helper(g: jit_utils.GraphContext, self, dim_or_y=None, keepdim=None):
+    # torch.max(input)
+    if dim_or_y is None and keepdim is None:
+        return g.op("ReduceMax", self, keepdims_i=0)
+    # torch.max(input, other)
+    if keepdim is None:
+        return _op_with_optional_float_cast(g, "Max", self, dim_or_y, opset_before=12)
+    # torch.max(input, dim, keepdim)
+    else:
+        keepdim = _get_const(keepdim, "i", "keepdim")
+        dim = _get_const(dim_or_y, "i", "dim")
+        if g.opset < 18:
+            max = g.op("ReduceMax", self, axes_i=[dim], keepdims_i=keepdim)
+        else:
+            axes = g.op("Constant", value_t=torch.tensor([dim], dtype=torch.long))
+            max = g.op("ReduceMax", self, axes, keepdims_i=keepdim)
+        indices = g.op("ArgMax", self, axis_i=dim, keepdims_i=keepdim)
+        return max, indices
+
+
+def _min_helper(g: jit_utils.GraphContext, self, dim_or_y=None, keepdim=None):
+    # torch.min(input)
+    if dim_or_y is None and keepdim is None:
+        return g.op("ReduceMin", self, keepdims_i=0)
+    # torch.min(input, other)
+    if keepdim is None:
+        return _op_with_optional_float_cast(g, "Min", self, dim_or_y, opset_before=12)
+    # torch.min(input, dim, keepdim)
+    else:
+        keepdim = _get_const(keepdim, "i", "keepdim")
+        dim = _get_const(dim_or_y, "i", "dim")
+        if g.opset < 18:
+            min = g.op("ReduceMin", self, axes_i=[dim], keepdims_i=keepdim)
+        else:
+            axes = g.op("Constant", value_t=torch.tensor([dim], dtype=torch.long))
+            min = g.op("ReduceMin", self, axes, keepdims_i=keepdim)
+        indices = g.op("ArgMin", self, axis_i=dim, keepdims_i=keepdim)
+        return min, indices
+
+
+def _numel_helper(g: jit_utils.GraphContext, self):
+    shape = g.op("Shape", self)
+    return g.op("ReduceProd", shape, keepdims_i=0)
+
+
+@parse_args("v", "is", "i", "i")
+def _var_mean_helper(g: jit_utils.GraphContext, input, dim, correction, keepdim):
+    if g.opset < 18:
+        if dim is None:
+            mean = g.op("ReduceMean", input, keepdims_i=0)
+            t_mean = mean
+            num_elements = _numel_helper(g, input)
+        else:
+            mean = g.op("ReduceMean", input, axes_i=dim, keepdims_i=keepdim)
+            t_mean = g.op("ReduceMean", input, axes_i=dim, keepdims_i=1)
+            reduced_dims = g.op("Shape", input)
+            # dim could contain one or more dimensions
+            reduced_dims = g.op(
+                "Gather",
+                reduced_dims,
+                g.op("Constant", value_t=torch.tensor(dim)),
+                axis_i=0,
+            )
+            num_elements = g.op("ReduceProd", reduced_dims, keepdims_i=0)
+        sub_v = g.op("Sub", input, t_mean)
+        sqr_sub = g.op("Mul", sub_v, sub_v)
+        keepdim_mean = 0 if dim is None else keepdim
+        var = g.op("ReduceMean", sqr_sub, axes_i=dim, keepdims_i=keepdim_mean)
+        # Correct the bias in the variance by dividing by (N - correction)
+        # instead of N.
+        if correction is None:
+            correction = 1
+        if correction != 0:
+            num_elements = g.op(
+                "Cast", num_elements, to_i=_C_onnx.TensorProtoDataType.FLOAT
+            )
+            one = g.op("Constant", value_t=torch.tensor(correction, dtype=torch.float))
+            mul = g.op("Mul", var, num_elements)
+            var = g.op("Div", mul, g.op("Sub", num_elements, one))
+        return var, mean
+    else:
+        axes = None
+        if dim is None:
+            mean = g.op("ReduceMean", input, keepdims_i=0)
+            t_mean = mean
+            num_elements = _numel_helper(g, input)
+        else:
+            axes = g.op("Constant", value_t=torch.tensor(dim, dtype=torch.long))
+            mean = g.op("ReduceMean", input, axes, keepdims_i=keepdim)
+            t_mean = g.op("ReduceMean", input, axes, keepdims_i=1)
+            reduced_dims = g.op("Shape", input)
+            # dim could contain one or more dimensions
+            reduced_dims = g.op(
+                "Gather",
+                reduced_dims,
+                g.op("Constant", value_t=torch.tensor(dim)),
+                axis_i=0,
+            )
+            num_elements = g.op("ReduceProd", reduced_dims, keepdims_i=0)
+        sub_v = g.op("Sub", input, t_mean)
+        sqr_sub = g.op("Mul", sub_v, sub_v)
+        keepdim_mean = 0 if dim is None else keepdim
+        if axes is None:
+            var = g.op("ReduceMean", sqr_sub, keepdims_i=keepdim_mean)
+        else:
+            var = g.op("ReduceMean", sqr_sub, axes, keepdims_i=keepdim_mean)
+        # Correct the bias in the variance by dividing by (N - correction)
+        # instead of N.
+        if correction is None:
+            correction = 1
+        if correction != 0:
+            num_elements = g.op(
+                "Cast", num_elements, to_i=_C_onnx.TensorProtoDataType.FLOAT
+            )
+            one = g.op("Constant", value_t=torch.tensor(correction, dtype=torch.float))
+            mul = g.op("Mul", var, num_elements)
+            var = g.op("Div", mul, g.op("Sub", num_elements, one))
+        return var, mean
+
+
+def _embedding_bag_helper(
+    g: jit_utils.GraphContext,
+    embedding_matrix,
+    indices,
+    offsets,
+    scale_grad_by_freq,
+    mode,
+    sparse,
+    per_sample_weights,
+    include_last_offset,
+    padding_idx,
+):
+    if scale_grad_by_freq and GLOBALS.export_training:
+        return _onnx_unsupported(
+            "embedding_bag with scale_grad_by_freq for training mode"
+        )
+    if padding_idx is not None and padding_idx >= 0:
+        raise RuntimeError("embedding_bag with padding_idx")
+
+    loop_condition = g.op("Constant", value_t=torch.tensor(1))
+    loop_condition = g.op("Cast", loop_condition, to_i=_C_onnx.TensorProtoDataType.BOOL)
+    zero = g.op("Constant", value_t=torch.tensor([0]))
+
+    indices_len = _unsqueeze_helper(
+        g,
+        _size_helper(g, indices, g.op("Constant", value_t=torch.tensor(0))),
+        [0],
+    )
+    if not include_last_offset:
+        offsets = [offsets, indices_len]
+        offsets = g.op("Concat", *offsets, axis_i=0)
+
+    # `offsets` holds the starting index position of each bag, so we slice the
+    # indices per bag (as determined by offsets) and gather each slice into
+    # indices_row, which is then used to gather from the embedding matrix.
+    # The embeddings output is a loop scan output, so we can avoid creating a
+    # sequence and inserting elements into it.
+ offsets_starts = _slice_helper( + g, offsets, axes=[0], starts=[0], ends=[sys.maxsize], steps=[1] + ) + offsets_ends = _slice_helper( + g, offsets, axes=[0], starts=[1], ends=[sys.maxsize], steps=[1] + ) + + loop_len = _size_helper(g, offsets_ends, g.op("Constant", value_t=torch.tensor(0))) + + loop, (loop_context,), _ = jit_utils.add_op_with_blocks( + g, "Loop", loop_len, loop_condition, n_blocks=1 + ) + loop_block = loop_context.block + + # FIXME(justinchuby): We need to handle what happens when we call b.op on a node return + block_input_iter = utils._add_input_to_block(loop_block) + cond = utils._add_input_to_block(loop_block) + + indices_start = loop_context.op( + "Gather", offsets_starts, block_input_iter, axis_i=0 + ) + indices_end = loop_context.op("Gather", offsets_ends, block_input_iter, axis_i=0) + indices_start = _unsqueeze_helper(loop_context, indices_start, [0]) + indices_end = _unsqueeze_helper(loop_context, indices_end, [0]) + + indices_row = loop_context.op("Slice", indices, indices_start, indices_end, zero) + embeddings = loop_context.op("Gather", embedding_matrix, indices_row, axis_i=0) + if not _is_none(per_sample_weights): + per_sample_weights_row = loop_context.op( + "Slice", per_sample_weights, indices_start, indices_end, zero + ) + per_sample_weights_row = _unsqueeze_helper( + loop_context, per_sample_weights_row, [1] + ) + embeddings = loop_context.op("Mul", embeddings, per_sample_weights_row) + if mode == 0: + embeddings = _reducesum_helper( + loop_context, embeddings, axes_i=[0], keepdims_i=0 + ) + elif mode == 1: + if loop_context.opset < 18: + embeddings = loop_context.op( + "ReduceMean", embeddings, axes_i=[0], keepdims_i=0 + ) + else: + axes = loop_context.op( + "Constant", value_t=torch.tensor([0], dtype=torch.long) + ) + embeddings = loop_context.op("ReduceMean", embeddings, axes, keepdims_i=0) + else: + if loop_context.opset < 18: + embeddings = loop_context.op( + "ReduceMax", embeddings, axes_i=[0], keepdims_i=0 + ) + else: + axes = loop_context.op( + "Constant", value_t=torch.tensor([0], dtype=torch.long) + ) + embeddings = loop_context.op("ReduceMax", embeddings, axes, keepdims_i=0) + + cond_out = loop_context.op( + "Cast", loop_condition, to_i=_C_onnx.TensorProtoDataType.BOOL + ) + utils._add_output_to_block(loop_block, cond_out) + utils._add_output_to_block(loop_block, embeddings) + + # aten::embedding_bag returns a tuple of 4 elements: output, offset2bag, bag_size, max_indices. + # But the last three outputs are not used in torch.nn.EmbeddingBag or torch.nn.functional.embedding_bag. 
+ return loop.node().output(), None, None, None + + +def _linalg_vector_norm_helper( + g: jit_utils.GraphContext, + self: torch._C.Value, + ord: float, + dim: Sequence[int] | None, + keepdim: bool, + dtype: torch._C.Value, +): + axes = None + # Conditions based on https://pytorch.org/docs/stable/generated/torch.linalg.vector_norm.html + if _is_none(dim): + self = _reshape_helper(g, self, [-1]) + keepdim = False + elif g.opset >= 18: + axes = g.op("Constant", value_t=torch.tensor(dim, dtype=torch.long)) + + if ord == math.inf: + if g.opset < 18: + result = g.op( + "ReduceMax", g.op("Abs", self), axes_i=dim, keepdims_i=keepdim + ) + else: + if axes is None: + result = g.op("ReduceMax", g.op("Abs", self), keepdims_i=keepdim) + else: + result = g.op("ReduceMax", g.op("Abs", self), axes, keepdims_i=keepdim) + elif ord == -math.inf: + if g.opset < 18: + result = g.op( + "ReduceMin", g.op("Abs", self), axes_i=dim, keepdims_i=keepdim + ) + else: + if axes is None: + result = g.op("ReduceMin", g.op("Abs", self), keepdims_i=keepdim) + else: + result = g.op("ReduceMin", g.op("Abs", self), axes, keepdims_i=keepdim) + elif ord == 0: + if g.opset < 11: + return _onnx_opset_unsupported_detailed( + "linalg_vector_norm", 9, 11, "ord=0 not supported", self + ) + else: + if dim is None: + self = _reshape_helper( + g, + self, + g.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64)), + ) + keepdim = False + + cond_op = g.op( + "Not", + g.op("Equal", self, g.op("Constant", value_t=torch.LongTensor([0]))), + ) + cond_op = g.op( + "Cast", + cond_op, + to_i=_type_utils.JitScalarType.from_value(self).onnx_type(), + ) + return _reducesum_helper(g, cond_op, axes_i=dim, keepdims_i=keepdim) + elif ord == 1: + if g.opset < 18: + result = _reduce_op_symbolic_helper("ReduceL1")( + g, self, dim=dim, keepdim=keepdim + ) + else: + if axes is None: + result = _reduce_op_symbolic_helper("ReduceL1")( + g, self, keepdim=keepdim + ) + else: + result = _reduce_op_symbolic_helper("ReduceL1")( + g, self, axes, keepdim=keepdim + ) + elif ord == 2: + if g.opset < 18: + result = _reduce_op_symbolic_helper("ReduceL2")( + g, self, dim=dim, keepdim=keepdim + ) + else: + if axes is None: + result = _reduce_op_symbolic_helper("ReduceL2")( + g, self, keepdim=keepdim + ) + else: + result = _reduce_op_symbolic_helper("ReduceL2")( + g, self, axes, keepdim=keepdim + ) + else: + ord_op = g.op("Constant", value_t=torch.tensor(ord, dtype=torch.float32)) + result = _reducesum_helper( + g, g.op("Pow", g.op("Abs", self), ord_op), axes_i=dim, keepdims_i=keepdim + ) + result = g.op( + "Pow", + result, + g.op( + "Div", + g.op("Constant", value_t=torch.tensor(1, dtype=torch.float32)), + ord_op, + ), + ) + + if not _is_none(dtype): + dtype = _get_const(dtype, "i", "dtype") + result = g.op("Cast", result, to_i=_type_utils.JitScalarType(dtype).onnx_type()) # type: ignore[arg-type] + return result + + +# Deprecated. 
Internally use _type_utils.ScalarType +# TODO: remove these once we support Type's in the JIT IR and we can once again +# use the unified toType operator +cast_pytorch_to_onnx = { + "Byte": _C_onnx.TensorProtoDataType.UINT8, + "Char": _C_onnx.TensorProtoDataType.INT8, + "Double": _C_onnx.TensorProtoDataType.DOUBLE, + "Float": _C_onnx.TensorProtoDataType.FLOAT, + "Half": _C_onnx.TensorProtoDataType.FLOAT16, + "Int": _C_onnx.TensorProtoDataType.INT32, + "Long": _C_onnx.TensorProtoDataType.INT64, + "Short": _C_onnx.TensorProtoDataType.INT16, + "Bool": _C_onnx.TensorProtoDataType.BOOL, + "ComplexFloat": _C_onnx.TensorProtoDataType.COMPLEX64, + "ComplexDouble": _C_onnx.TensorProtoDataType.COMPLEX128, + "BFloat16": _C_onnx.TensorProtoDataType.BFLOAT16, + "Undefined": _C_onnx.TensorProtoDataType.UNDEFINED, +} + +# Deprecated. Internally use _type_utils.ScalarType +scalar_name_to_pytorch = { + "uint8_t": "Byte", + "int8_t": "Char", + "double": "Double", + "float": "Float", + "half": "Half", + "int": "Int", + "int64_t": "Long", + "int16_t": "Short", + "bool": "Bool", + "complex64": "ComplexFloat", + "complex128": "ComplexDouble", + "qint8": "QInt8", + "quint8": "QUInt8", + "qint32": "QInt32", + "bfloat16": "BFloat16", +} + + +# Deprecated. Internally use _type_utils.ScalarType +# This indicates each scalar type's corresponding +# torch type. Related source: +# https://github.com/pytorch/pytorch/blob/344defc9733a45fee8d0c4d3f5530f631e823196/c10/core/ScalarType.h +scalar_type_to_pytorch_type = [ + torch.uint8, # 0 + torch.int8, # 1 + torch.short, # 2 + torch.int, # 3 + torch.int64, # 4 + torch.half, # 5 + torch.float, # 6 + torch.double, # 7 + torch.complex32, # 8 + torch.complex64, # 9 + torch.complex128, # 10 + torch.bool, # 11 + torch.qint8, # 12 + torch.quint8, # 13 + torch.qint32, # 14 + torch.bfloat16, # 15 +] + +# Deprecated. Internally use _type_utils.ScalarType +# source of truth is +# https://github.com/pytorch/pytorch/blob/master/torch/csrc/utils/tensor_dtypes.cpp +pytorch_name_to_type = { + "Byte": torch.uint8, + "Char": torch.int8, + "Double": torch.double, + "Float": torch.float, + "Half": torch.half, + "Int": torch.int, + "Long": torch.int64, + "Short": torch.short, + "Bool": torch.bool, + "ComplexFloat": torch.complex64, + "ComplexDouble": torch.complex128, + "QInt8": torch.qint8, + "QUInt8": torch.quint8, + "QInt32": torch.qint32, + "BFloat16": torch.bfloat16, +} + + +# Deprecated. Internally use _type_utils.ScalarType +scalar_type_to_onnx = [ + cast_pytorch_to_onnx["Byte"], # 0 + cast_pytorch_to_onnx["Char"], # 1 + cast_pytorch_to_onnx["Short"], # 2 + cast_pytorch_to_onnx["Int"], # 3 + cast_pytorch_to_onnx["Long"], # 4 + cast_pytorch_to_onnx["Half"], # 5 + cast_pytorch_to_onnx["Float"], # 6 + cast_pytorch_to_onnx["Double"], # 7 + cast_pytorch_to_onnx["Undefined"], # 8 + cast_pytorch_to_onnx["ComplexFloat"], # 9 + cast_pytorch_to_onnx["ComplexDouble"], # 10 + cast_pytorch_to_onnx["Bool"], # 11 + cast_pytorch_to_onnx["Char"], # 12 + cast_pytorch_to_onnx["Byte"], # 13 + cast_pytorch_to_onnx["Int"], # 14 + cast_pytorch_to_onnx["BFloat16"], # 15 +] + +# Global set to store the list of quantized operators in the network. +# This is currently only used in the conversion of quantized ops from PT -> C2 via ONNX. 
+_quantized_ops: set[int] = set() diff --git a/.venv/Lib/site-packages/torch/onnx/symbolic_opset10.py b/.venv/Lib/site-packages/torch/onnx/symbolic_opset10.py new file mode 100644 index 0000000000000000000000000000000000000000..7407cb6b2dbe1adc890a04441665ded2fbc6d366 --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/symbolic_opset10.py @@ -0,0 +1,1184 @@ +# mypy: allow-untyped-defs +# mypy: disable-error-code=arg-type +from __future__ import annotations + +import functools +import sys +import warnings +from typing import Sequence + +import torch +import torch._C._onnx as _C_onnx +import torch.onnx +from torch import _C + +# Monkey-patch graph manipulation methods on Graph, used for the ONNX symbolics +from torch.onnx import ( + _constants, + _type_utils, + errors, + symbolic_helper, + symbolic_opset9 as opset9, +) +from torch.onnx._globals import GLOBALS +from torch.onnx._internal import jit_utils, registration + + +# EDITING THIS FILE? READ THIS FIRST! +# see Note [Edit Symbolic Files] in README.md + +# This file exports ONNX ops for opset 10 +# Opset 10 is supported by ONNX release 1.5.0 +# release on 04/24/19 + + +__all__ = [ + "dequantize", + "div", + "embedding_bag", + "fake_quantize_per_tensor_affine", + "flip", + "fmod", + "isfinite", + "isinf", + "nan_to_num", + "quantize_per_tensor", + "quantized_add_relu", + "quantized_add", + "quantized_cat", + "quantized_conv1d_relu", + "quantized_conv2d_relu", + "quantized_conv3d_relu", + "quantized_conv1d", + "quantized_conv2d", + "quantized_conv3d", + "quantized_conv_transpose1d", + "quantized_conv_transpose2d", + "quantized_conv_transpose3d", + "quantized_group_norm", + "quantized_hardswish", + "quantized_instance_norm", + "quantized_layer_norm", + "quantized_leaky_relu", + "quantized_linear", + "quantized_linear_relu", + "quantized_mul", + "quantized_sigmoid", + "slice", + "sort", + "topk", +] + + +_onnx_symbolic = functools.partial(registration.onnx_symbolic, opset=10) + + +@_onnx_symbolic("aten::div") +def div(g: jit_utils.GraphContext, self, other, *args): + if len(args) == 0: + return opset9.true_divide(g, self, other) + else: + return _div_rounding_mode(g, self, other, *args) + + +@symbolic_helper.parse_args("v", "v", "s") +def _div_rounding_mode(g: jit_utils.GraphContext, self, other, rounding_mode): + if rounding_mode == "floor": + return _floor_divide(g, self, other) + else: + return opset9._div_rounding_mode(g, self, other, rounding_mode) + + +@_onnx_symbolic("aten::_floor_divide") +def _floor_divide(g: jit_utils.GraphContext, self, other): + if symbolic_helper._is_fp(self) or symbolic_helper._is_fp(other): + out = opset9.true_divide(g, self, other) + return g.op("Floor", out) + else: + # Integer division does trunction rounding + div = g.op("Div", self, other) + # Division is negative if: self < 0 != other < 0 + zero = g.op("Constant", value_t=torch.tensor(0, dtype=torch.int64)) + negative = g.op("Xor", g.op("Less", self, zero), g.op("Less", other, zero)) + + # For negative numbers with self % other != 0, subtract 1 to round down instead of up + mod = g.op("Mod", self, other, fmod_i=0) + fixup_mask = g.op("And", negative, g.op("Not", g.op("Equal", mod, zero))) + + one = g.op("Constant", value_t=torch.tensor(1, dtype=torch.int64)) + fixup = g.op("Sub", div, one) + return g.op("Where", fixup_mask, fixup, div) + + +@_onnx_symbolic("aten::sort") +@symbolic_helper.parse_args("v", "i", "i", "none") +def sort(g: jit_utils.GraphContext, self, dim, decending, out=None): + return symbolic_helper._sort_helper(g, self, dim, 
decending=decending, out=out) + + +@_onnx_symbolic("aten::topk") +@symbolic_helper.parse_args("v", "v", "i", "i", "i", "none") +def topk(g: jit_utils.GraphContext, self, k, dim, largest, sorted, out=None): + return symbolic_helper._topk_helper( + g, self, k, dim, largest=largest, sorted=sorted, out=out + ) + + +def _aten_max_pool_onnx( + g: jit_utils.GraphContext, + self: _C.Value, + kernel_shape: Sequence[int], + strides: Sequence[int], + pads: Sequence[int], + dilations: Sequence[int], + ceil_mode: bool, + unbatched_rank: int, +) -> _C.Value: + self_rank = g.op("Size", g.op("Shape", self)) + if self_rank == unbatched_rank: # C,H,W -> N,C,H,W and N=1 + self = g.op( + "Unsqueeze", + self, + g.op("Constant", value_t=torch.tensor([0], dtype=torch.int64)), + ) + + pool_result, _ = g.op( + "MaxPool", + self, + outputs=2, + ceil_mode_i=ceil_mode, + dilations_i=dilations, + kernel_shape_i=kernel_shape, + pads_i=pads, + strides_i=strides, + ) + + if self_rank == unbatched_rank: + pool_result = g.op( + "Squeeze", + pool_result, + g.op("Constant", value_t=torch.tensor([0], dtype=torch.int64)), + ) + + return pool_result + + +# For MaxPool +def _adjust_attributes_of_max_pool( + expand_size: int, + kernel_size: Sequence[int] | int, + stride: Sequence[int] | int, + padding: Sequence[int] | int, + dilation: Sequence[int] | int, +) -> tuple[Sequence[int], Sequence[int], Sequence[int], Sequence[int]]: + """Adjust attributes of avg_pool to match ONNX specification.""" + + if isinstance(dilation, int): + dilation = [dilation] * expand_size + + if isinstance(kernel_size, int): + kernel_shape = [kernel_size] * expand_size + else: + kernel_shape = kernel_size # type: ignore[assignment] + + if isinstance(padding, int): + pads = [padding] * expand_size * 2 # type: ignore[operator, assignment] + elif len(padding) == 1: + pads = padding * expand_size * 2 # type: ignore[operator, assignment] + elif len(padding) == 2: + # 2D padding + pads = padding * 2 # type: ignore[operator, assignment] + elif len(padding) == 3: + # 3D padding + pads = padding * 2 # type: ignore[operator, assignment] + else: + # When padding is already done for all dimensions, + # we don't need to double it + # eg: (1, 1, 1, 1, 1, 1) + pads = padding # type: ignore[assignment] + + if isinstance(stride, int): + strides = [stride] * expand_size + elif not stride: + strides = kernel_shape + else: + strides = stride # type: ignore[assignment] + + return (kernel_shape, strides, pads, dilation) + + +def _aten_max_pool_with_indices_onnx( + g: jit_utils.GraphContext, + self: _C.Value, + kernel_shape: Sequence[int], + strides: Sequence[int], + pads: Sequence[int], + dilations: Sequence[int], + ceil_mode: bool, + unbatched_rank: int, + n_dims_one: Sequence[int], + n_dims_zero: Sequence[int], + n_dims_axes: Sequence[int], +) -> tuple[_C.Value, Sequence[int]]: + self_rank = g.op("Size", g.op("Shape", self)) + if self_rank == unbatched_rank: # C,H,W -> N,C,H,W and N=1 + self = g.op( + "Unsqueeze", + self, + g.op("Constant", value_t=torch.tensor([0], dtype=torch.int64)), + ) + + pool_result, indices = g.op( + "MaxPool", + self, + outputs=2, + ceil_mode_i=ceil_mode, + dilations_i=dilations, + kernel_shape_i=kernel_shape, + pads_i=pads, + strides_i=strides, + ) + _, flatten_indices = g.op( + "MaxPool", + self, + outputs=2, + dilations_i=dilations, + kernel_shape_i=n_dims_one, + strides_i=n_dims_one, + ) + + ends = g.op("Constant", value_t=torch.tensor(n_dims_one)) + starts = g.op("Constant", value_t=torch.tensor(n_dims_zero)) + axes = g.op("Constant", 
value_t=torch.tensor(n_dims_axes)) + + delta = g.op("Slice", flatten_indices, starts, ends, axes) + indices = g.op("Sub", indices, delta) + + if self_rank == unbatched_rank: + pool_result = g.op( + "Squeeze", pool_result, value_t=torch.tensor([0], dtype=torch.int64) + ) + indices = g.op("Squeeze", indices, value_t=torch.tensor([0], dtype=torch.int64)) + + return (pool_result, indices) + + +@_onnx_symbolic( + "aten::max_pool1d", + decorate=[symbolic_helper._apply_params("max_pool1d", 1, return_indices=False)], +) +@_onnx_symbolic( + "aten::max_pool2d", + decorate=[symbolic_helper._apply_params("max_pool2d", 2, return_indices=False)], +) +@_onnx_symbolic( + "aten::max_pool3d", + decorate=[symbolic_helper._apply_params("max_pool3d", 3, return_indices=False)], +) +@_onnx_symbolic( + "aten::max_pool1d_with_indices", + decorate=[ + symbolic_helper._apply_params( + "max_pool1d_with_indices", + 1, + return_indices=True, + ) + ], +) +@_onnx_symbolic( + "aten::max_pool2d_with_indices", + decorate=[ + symbolic_helper._apply_params( + "max_pool2d_with_indices", + 2, + return_indices=True, + ) + ], +) +@_onnx_symbolic( + "aten::max_pool3d_with_indices", + decorate=[ + symbolic_helper._apply_params( + "max_pool3d_with_indices", + 3, + return_indices=True, + ) + ], +) +def _max_pool(name: str, expand_size: int, return_indices: bool): + @symbolic_helper.quantized_args(True, False, False, False, False, False) + @symbolic_helper.parse_args("v", "is", "is", "is", "is", "i") + def symbolic_fn( + g: jit_utils.GraphContext, + input: _C.Value, + kernel_size: Sequence[int], + stride: Sequence[int], + padding: int | Sequence[int], + dilation: Sequence[int], + ceil_mode: bool, + ): + kernel_shape, strides, pads, dilations = _adjust_attributes_of_max_pool( + expand_size, kernel_size, stride, padding, dilation + ) + + if return_indices: + return _aten_max_pool_with_indices_onnx( + g, + input, + kernel_shape, + strides, + pads, + dilations, + ceil_mode, + expand_size + 1, + ([1] * expand_size), + ([0] * expand_size), + ([2 + i for i in range(expand_size)]), + ) + else: + return _aten_max_pool_onnx( + g, + input, + kernel_shape, + strides, + pads, + dilations, + ceil_mode, + expand_size + 1, + ) + + return symbolic_fn + + +# For AvgPool +def _adjust_attributes_of_avg_pool( + expand_size: int, + kernel_size: Sequence[int] | int, + stride: Sequence[int] | int, + padding: Sequence[int] | int, +) -> tuple[Sequence[int], Sequence[int], Sequence[int]]: + """Adjust attributes of avg_pool to match ONNX specification.""" + + if isinstance(kernel_size, int): + kernel_shape = [kernel_size] * expand_size + else: + kernel_shape = kernel_size # type: ignore[assignment] + + if isinstance(padding, int): + pads = [padding] * expand_size * 2 + elif len(padding) == 1: + pads = padding * expand_size * 2 # type: ignore[operator, assignment] + elif len(padding) == 2: + pads = padding * expand_size # type: ignore[operator, assignment] + else: + pads = padding * 2 # type: ignore[operator, assignment] + + if isinstance(stride, int): + strides = [stride] * expand_size + elif not stride: + strides = kernel_shape + else: + strides = stride # type: ignore[assignment] + + return (kernel_shape, strides, pads) + + +@_onnx_symbolic( + "aten::avg_pool1d", + decorate=[symbolic_helper._apply_params("avg_pool1d", 1)], +) +@_onnx_symbolic( + "aten::avg_pool2d", + decorate=[symbolic_helper._apply_params("avg_pool2d", 2)], +) +@_onnx_symbolic( + "aten::avg_pool3d", + decorate=[symbolic_helper._apply_params("avg_pool3d", 3)], +) +def _avg_pool(name, 
expand_size): + @symbolic_helper.quantized_args(True, False, False, False, False, False, False) + @symbolic_helper.parse_args("v", "is", "is", "is", "i", "i", "none") + def symbolic_fn( + g, + input: _C.Value, + kernel_size: Sequence[int], + stride: Sequence[int], + padding: int | Sequence[int], + ceil_mode: int, + count_include_pad: int, + divisor_override=None, + ): + kernel_shape, strides, pads = _adjust_attributes_of_avg_pool( + expand_size, kernel_size, stride, padding + ) + + result = g.op( + "AveragePool", + input, + ceil_mode_i=ceil_mode, + count_include_pad_i=count_include_pad, + kernel_shape_i=kernel_shape, + pads_i=pads, + strides_i=strides, + ) + + return result + + return symbolic_fn + + +@_onnx_symbolic( + "aten::upsample_nearest1d", + decorate=[symbolic_helper._apply_params("upsample_nearest1d", 3, "nearest")], +) +@_onnx_symbolic( + "aten::upsample_nearest2d", + decorate=[symbolic_helper._apply_params("upsample_nearest2d", 4, "nearest")], +) +@_onnx_symbolic( + "aten::upsample_nearest3d", + decorate=[symbolic_helper._apply_params("upsample_nearest3d", 5, "nearest")], +) +@_onnx_symbolic( + "aten::upsample_linear1d", + decorate=[symbolic_helper._apply_params("upsample_linear1d", 3, "linear")], +) +@_onnx_symbolic( + "aten::upsample_bilinear2d", + decorate=[symbolic_helper._apply_params("upsample_bilinear2d", 4, "linear")], +) +@_onnx_symbolic( + "aten::upsample_trilinear3d", + decorate=[symbolic_helper._apply_params("upsample_trilinear3d", 5, "linear")], +) +def _interpolate(name, dim, interpolate_mode): + @symbolic_helper.quantized_args(True, False, False) + def symbolic_fn(g, input, output_size, *args): + scales, align_corners = symbolic_helper._get_interpolate_attributes( + g, interpolate_mode, args + ) + symbolic_helper._interpolate_warning(interpolate_mode) + align_corners = symbolic_helper._maybe_get_scalar(align_corners) + if align_corners: + return symbolic_helper._unimplemented(name, "align_corners == True", input) + if scales is None: + scales = symbolic_helper._interpolate_size_to_scales( + g, input, output_size, dim + ) + return g.op("Resize", input, scales, mode_s=interpolate_mode) + + return symbolic_fn + + +@_onnx_symbolic("aten::__interpolate") +def __interpolate( + g: jit_utils.GraphContext, + input, + size, + scale_factor, + mode, + align_corners, + recompute_scale_factor, + antialias, +): + scales, mode = symbolic_helper._interpolate_get_scales_and_mode( + g, input, size, scale_factor, mode, align_corners + ) + return g.op("Resize", input, scales, mode_s=mode) + + +def _slice( + g: jit_utils.GraphContext, + input: torch._C.Value, + axes: list | torch.Tensor | torch._C.Value, + starts: list | torch.Tensor | torch._C.Value, + ends: list | torch.Tensor | torch._C.Value, + steps: list | torch.Tensor | torch._C.Value | None = None, +): + def is_none_value(value): + if value is None: + return True + return ( + isinstance(value, torch._C.Value) + and value.node().kind() == "prim::Constant" + and isinstance(value.type(), _C.NoneType) + ) + + def to_slice_input(list_or_value, default_value=None): + # Convert input param into a 1D torch.Value. 
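+        # (Lists and tensors become Constant nodes; 0-d values are unsqueezed
+        # to 1-d; 1-d values pass through unchanged.)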
+ if is_none_value(list_or_value) and default_value is not None: + list_or_value = [default_value] + + if isinstance(list_or_value, (list, torch.Tensor)): + return g.op("Constant", value_t=torch.tensor(list_or_value)) + + rank = symbolic_helper._get_tensor_rank(list_or_value) + if rank == 0: + return symbolic_helper._unsqueeze_helper(g, list_or_value, [0]) + if rank == 1: + return list_or_value + raise errors.SymbolicValueError( + f"Rank must be 0 or 1, not {rank}", list_or_value + ) + + def get_const_value(list_or_value): + if isinstance(list_or_value, (list, torch.Tensor)): + if len(list_or_value) == 1: + return list_or_value[0] + return None + return symbolic_helper._maybe_get_const(list_or_value, "i") + + # Check if slice is a no-op + if ( + get_const_value(starts) == 0 + and get_const_value(ends) == _constants.INT64_MAX + and (steps is None or get_const_value(steps) == 1) + ): + return input + + axes = to_slice_input(axes) + starts = to_slice_input(starts, default_value=0) + ends = to_slice_input(ends, default_value=_constants.INT64_MAX) + if steps is None: + return g.op("Slice", input, starts, ends, axes) + steps = to_slice_input(steps, default_value=1) + return g.op("Slice", input, starts, ends, axes, steps) + + +@_onnx_symbolic("aten::slice") +def slice(g: jit_utils.GraphContext, self, *args): + if len(args) == 4: + # aten::slice(Tensor self, int dim, int? start=None, int? end=None, int step=1) -> Tensor + dims, start, end, step = args + elif len(args) == 3: + # aten::slice(t[] l, int? start=None, int? end=None, int step=1) -> t[] + start, end, step = args + dims = [0] + else: + raise errors.SymbolicValueError("Unknown aten::slice signature", self) + + return symbolic_helper._slice_helper( + g, + self, + axes=dims, + starts=start, + ends=end, + steps=step, + ) + + +@_onnx_symbolic("aten::flip") +@symbolic_helper.parse_args("v", "is") +def flip(g: jit_utils.GraphContext, input, dims): + return symbolic_helper._slice_helper( + g, + input, + axes=dims, + starts=[-1] * len(dims), + ends=[-_constants.INT64_MAX] * len(dims), + steps=[-1] * len(dims), + ) + + +@_onnx_symbolic("aten::fmod") +def fmod(g: jit_utils.GraphContext, input, other): + return g.op("Mod", input, other, fmod_i=1) + + +@_onnx_symbolic("aten::embedding_bag") +@symbolic_helper.parse_args("v", "v", "v", "i", "i", "i", "v", "i", "i") +def embedding_bag( + g: jit_utils.GraphContext, + embedding_matrix, + indices, + offsets, + scale_grad_by_freq, + mode, + sparse, + per_sample_weights, + include_last_offset, + padding_idx, +): + if scale_grad_by_freq and GLOBALS.export_training: + return symbolic_helper._onnx_unsupported( + "embedding_bag with scale_grad_by_freq for training mode" + ) + if padding_idx is not None and padding_idx >= 0: + raise RuntimeError("embedding_bag with padding_idx") + + warnings.warn( + "Export of embedding_bag with dynamic input/offsets shape is not supported in opset 10. 
" + "Please use opset 11 or higher to export model for dynamic input shape.'" + ) + offsets_dim_0 = symbolic_helper._get_tensor_dim_size(offsets, 0) + if offsets_dim_0 is not None: + if include_last_offset: + offset_len = offsets_dim_0 - 1 + offsets_extended = offsets + else: + offset_len = offsets_dim_0 + offsets_extended = [ + offsets, + g.op("Constant", value_t=torch.tensor([sys.maxsize])), + ] + offsets_extended = g.op("Concat", *offsets_extended, axis_i=0) + list_ = [] + for i in range(offset_len): + start_ = symbolic_helper._unsqueeze_helper( + g, + opset9.select(g, offsets_extended, torch.tensor(0), torch.tensor(i)), + [0], + ) + end_ = symbolic_helper._unsqueeze_helper( + g, + opset9.select( + g, offsets_extended, torch.tensor(0), torch.tensor(i + 1) + ), + [0], + ) + axes_ = g.op("Constant", value_t=torch.tensor([0])) + indices_row = g.op("Slice", indices, start_, end_, axes_) + + embeddings = g.op("Gather", embedding_matrix, indices_row) + if not symbolic_helper._is_none(per_sample_weights): + per_sample_weights_row = g.op( + "Slice", per_sample_weights, start_, end_, axes_ + ) + per_sample_weights_row = symbolic_helper._unsqueeze_helper( + g, per_sample_weights_row, [1] + ) + embeddings = g.op("Mul", embeddings, per_sample_weights_row) + if mode == 0: + embeddings = symbolic_helper._reducesum_helper( + g, embeddings, axes_i=[0], keepdims_i=0 + ) + elif mode == 1: + embeddings = g.op("ReduceMean", embeddings, axes_i=[0], keepdims_i=0) + else: + embeddings = g.op("ReduceMax", embeddings, axes_i=[0], keepdims_i=0) + + embeddings = symbolic_helper._unsqueeze_helper(g, embeddings, [0]) + list_.append(embeddings) + + output = g.op("Concat", *list_, axis_i=0) + # aten::embedding_bag returns a tuple of 4 elements: output, offset2bag, bag_size, max_indices. + # But the last three outputs are not used in torch.nn.EmbeddingBag or torch.nn.functional.embedding_bag. + return output, None, None, None + else: + return symbolic_helper._onnx_unsupported( + "embedding_bag with unknown shape of offsets for opset 10 is not supported. " + "please use opset 11 or higher." + ) + + +@_onnx_symbolic("aten::fake_quantize_per_tensor_affine") +@symbolic_helper.parse_args("v", "v", "v", "i", "i") +def fake_quantize_per_tensor_affine( + g: jit_utils.GraphContext, + inputs, + scale, + zero_point, + quant_min=-128, + quant_max=127, +): + # NOTE: (0, 127) is a special case. PyTorch restricts activations to be in the range (0, 127). + # https://github.com/pytorch/pytorch/blob/b34b192d6b97325c9f78e5995c48c8498ede34bd/torch/ao/quantization/observer.py#L1422 + if (quant_min, quant_max) == (0, 127): + symbolic_helper._onnx_opset_unsupported_detailed( + "fake_quantize_per_tensor_affine", + 10, + 13, + "Quantize range (0, 127) not supported, requires opset 13 Clip", + inputs, + ) + if (quant_min, quant_max) not in [(0, 255), (-128, 127)]: + raise errors.SymbolicValueError( + f"For (quant_min, quant_max), ONNX allows only (0, 255) and (-128, 127). 
" + f"Got ({quant_min}, {quant_max})", + inputs, + ) + scale = symbolic_helper._maybe_get_scalar(scale) + if scale is None: + symbolic_helper._onnx_opset_unsupported_detailed( + "fake_quantize_per_tensor_affine", + 10, + 13, + "Non-constant scale not supported", + inputs, + ) + scale = scale.float().data # Avoid exporter generating double type + if quant_min == 0: + zero_point = g.op("Cast", zero_point, to_i=_C_onnx.TensorProtoDataType.UINT8) + else: + zero_point = g.op("Cast", zero_point, to_i=_C_onnx.TensorProtoDataType.INT8) + return g.op( + "DequantizeLinear", + g.op("QuantizeLinear", inputs, scale, zero_point), + scale, + zero_point, + ) + + +@_onnx_symbolic("aten::isinf") +def isinf(g: jit_utils.GraphContext, input): + return g.op("IsInf", g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.DOUBLE)) + + +@_onnx_symbolic("aten::isfinite") +def isfinite(g: jit_utils.GraphContext, input): + inf_node = isinf(g, input) + nan_node = opset9.isnan(g, input) + return opset9.__not_(g, opset9.__or_(g, inf_node, nan_node)) + + +@_onnx_symbolic("aten::quantize_per_tensor") +def quantize_per_tensor(g: jit_utils.GraphContext, input, scale, zero_point, dtype): + dtype = symbolic_helper._get_const(dtype, "i", "dtype") + # TODO(justinchuby): Extract all the cast ops into a helper function. + zero_point = g.op( + "Cast", zero_point, to_i=_type_utils.JitScalarType(dtype).onnx_type() + ) + scale = g.op("Cast", scale, to_i=_C_onnx.TensorProtoDataType.FLOAT) + return symbolic_helper.quantize_helper(g, input, scale, zero_point) + + +@_onnx_symbolic("aten::dequantize") +def dequantize(g: jit_utils.GraphContext, input): + return symbolic_helper.dequantize_helper(g, input)[0] + + +@_onnx_symbolic("aten::nan_to_num") +@symbolic_helper.parse_args("v", "f", "f", "f") +def nan_to_num(g: jit_utils.GraphContext, input, nan, posinf, neginf): + # Cannot create a int type tensor with inf/nan values, so we simply + # return the original tensor + if not symbolic_helper._is_fp(input): + return input + input_dtype = _type_utils.JitScalarType.from_value(input).dtype() + if nan is None: + nan = 0.0 + nan_cond = opset9.isnan(g, input) + nan_result = g.op( + "Where", + nan_cond, + g.op("Constant", value_t=torch.tensor([nan], dtype=input_dtype)), + input, + ) + + # For None values of posinf, neginf we use the greatest/lowest finite + # value representable by input's dtype. + finfo = torch.finfo(input_dtype) + if posinf is None: + posinf = finfo.max + posinf_cond = opset9.logical_and( + g, + isinf(g, nan_result), + opset9.gt(g, nan_result, g.op("Constant", value_t=torch.LongTensor([0]))), + ) + nan_posinf_result = g.op( + "Where", + posinf_cond, + g.op("Constant", value_t=torch.tensor([posinf], dtype=input_dtype)), + nan_result, + ) + + if neginf is None: + neginf = finfo.min + neginf_cond = opset9.logical_and( + g, + isinf(g, nan_posinf_result), + opset9.lt( + g, nan_posinf_result, g.op("Constant", value_t=torch.LongTensor([0])) + ), + ) + return g.op( + "Where", + neginf_cond, + g.op("Constant", value_t=torch.tensor([neginf], dtype=input_dtype)), + nan_posinf_result, + ) + + +# Quantized symbolics --------------------------------------------------------- +# https://github.com/pytorch/pytorch/wiki/PyTorch-ONNX-exporter#quantized-model-export +# Support starts from opset 10 because `DequantizeLinear` and `QuantizeLinear` were +# introduced in opset version 10. 
+@_onnx_symbolic("quantized::linear") +def quantized_linear( + g: jit_utils.GraphContext, q_input, q_weight, bias, op_scale, op_zero_point +): + input, input_scale, _, _ = symbolic_helper.dequantize_helper(g, q_input) + weight, weight_scale, _, _ = symbolic_helper.dequantize_helper(g, q_weight) + q_bias = symbolic_helper.requantize_bias_helper(g, bias, input_scale, weight_scale) + bias, _, _, _ = symbolic_helper.dequantize_helper(g, q_bias) + + output = opset9.linear(g, input, weight, bias) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::linear_relu") +def quantized_linear_relu( + g: jit_utils.GraphContext, q_input, q_weight, bias, op_scale, op_zero_point +): + input, input_scale, _, _ = symbolic_helper.dequantize_helper(g, q_input) + weight, weight_scale, _, _ = symbolic_helper.dequantize_helper(g, q_weight) + q_bias = symbolic_helper.requantize_bias_helper(g, bias, input_scale, weight_scale) + bias, _, _, _ = symbolic_helper.dequantize_helper(g, q_bias) + + output = opset9.linear(g, input, weight, bias) + output = opset9.relu(g, output) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::add") +def quantized_add(g: jit_utils.GraphContext, x, y, op_scale, op_zero_point): + x, _, _, _ = symbolic_helper.dequantize_helper(g, x) + y, _, _, _ = symbolic_helper.dequantize_helper(g, y) + + output = opset9.add(g, x, y) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::add_relu") +def quantized_add_relu(g: jit_utils.GraphContext, x, y, op_scale, op_zero_point): + x, _, _, _ = symbolic_helper.dequantize_helper(g, x) + y, _, _, _ = symbolic_helper.dequantize_helper(g, y) + + output = opset9.add(g, x, y) + output = opset9.relu(g, output) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::mul") +def quantized_mul(g: jit_utils.GraphContext, x, y, op_scale, op_zero_point): + x, _, _, _ = symbolic_helper.dequantize_helper(g, x) + y, _, _, _ = symbolic_helper.dequantize_helper(g, y) + + output = opset9.mul(g, x, y) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::hardswish") +def quantized_hardswish(g: jit_utils.GraphContext, x, op_scale, op_zero_point): + x, _, _, _ = symbolic_helper.dequantize_helper(g, x) + + output = opset9.hardswish(g, x) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::sigmoid") +def quantized_sigmoid(g: jit_utils.GraphContext, x, op_scale, op_zero_point): + x, _, _, _ = symbolic_helper.dequantize_helper(g, x) + + output = opset9.sigmoid(g, x) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::leaky_relu") +def quantized_leaky_relu( + g: jit_utils.GraphContext, x, negative_slope, inplace, op_scale, op_zero_point +): + x, _, _, _ = symbolic_helper.dequantize_helper(g, x) + + output = opset9.leaky_relu(g, x, negative_slope, inplace) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::layer_norm") +def quantized_layer_norm( + g: jit_utils.GraphContext, + x, + normalized_shape, + weight, + bias, + eps, + op_scale, + op_zero_point, +): + x, _, _, _ = symbolic_helper.dequantize_helper(g, x) + + output = opset9.layer_norm(g, x, normalized_shape, weight, bias, eps, False) + + return 
symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::group_norm") +def quantized_group_norm( + g: jit_utils.GraphContext, + x, + num_groups, + weight, + bias, + eps, + op_scale, + op_zero_point, +): + x, _, _, _ = symbolic_helper.dequantize_helper(g, x) + + output = opset9.group_norm(g, x, num_groups, weight, bias, eps, False) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::instance_norm") +@symbolic_helper.parse_args("v", "v", "v", "f", "v", "v") +def quantized_instance_norm( + g: jit_utils.GraphContext, + q_input, + weight, + bias, + eps, + op_scale, + op_zero_point, +): + input, _, _, _ = symbolic_helper.dequantize_helper(g, q_input) + + output = opset9.instance_norm( + g, input, weight, bias, None, None, False, 0.0, eps, False + ) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::conv1d_relu") +def quantized_conv1d_relu( + g: jit_utils.GraphContext, + q_input, + q_weight, + bias, + stride, + padding, + dilation, + groups, + op_scale, + op_zero_point, +): + input, input_scale, _, _ = symbolic_helper.dequantize_helper(g, q_input) + weight, weight_scale, _, _ = symbolic_helper.dequantize_helper(g, q_weight) + q_bias = symbolic_helper.requantize_bias_helper(g, bias, input_scale, weight_scale) + bias, _, _, _ = symbolic_helper.dequantize_helper(g, q_bias) + + output = opset9.conv1d(g, input, weight, bias, stride, padding, dilation, groups) + output = opset9.relu(g, output) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::conv2d_relu") +def quantized_conv2d_relu( + g: jit_utils.GraphContext, + q_input, + q_weight, + bias, + stride, + padding, + dilation, + groups, + op_scale, + op_zero_point, +): + input, input_scale, _, _ = symbolic_helper.dequantize_helper(g, q_input) + weight, weight_scale, _, _ = symbolic_helper.dequantize_helper(g, q_weight) + q_bias = symbolic_helper.requantize_bias_helper(g, bias, input_scale, weight_scale) + bias, _, _, _ = symbolic_helper.dequantize_helper(g, q_bias) + + output = opset9.conv2d(g, input, weight, bias, stride, padding, dilation, groups) + output = opset9.relu(g, output) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::conv3d_relu") +def quantized_conv3d_relu( + g: jit_utils.GraphContext, + q_input, + q_weight, + bias, + stride, + padding, + dilation, + groups, + op_scale, + op_zero_point, +): + input, input_scale, _, _ = symbolic_helper.dequantize_helper(g, q_input) + weight, weight_scale, _, _ = symbolic_helper.dequantize_helper(g, q_weight) + q_bias = symbolic_helper.requantize_bias_helper(g, bias, input_scale, weight_scale) + bias, _, _, _ = symbolic_helper.dequantize_helper(g, q_bias) + + output = opset9.conv3d(g, input, weight, bias, stride, padding, dilation, groups) + output = opset9.relu(g, output) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::conv1d") +def quantized_conv1d( + g: jit_utils.GraphContext, + q_input, + q_weight, + bias, + stride, + padding, + dilation, + groups, + op_scale, + op_zero_point, +): + input, input_scale, _, _ = symbolic_helper.dequantize_helper(g, q_input) + weight, weight_scale, _, _ = symbolic_helper.dequantize_helper(g, q_weight) + q_bias = symbolic_helper.requantize_bias_helper(g, bias, input_scale, weight_scale) + bias, _, _, _ = 
symbolic_helper.dequantize_helper(g, q_bias) + + output = opset9.conv1d(g, input, weight, bias, stride, padding, dilation, groups) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::conv2d") +def quantized_conv2d( + g: jit_utils.GraphContext, + q_input, + q_weight, + bias, + stride, + padding, + dilation, + groups, + op_scale, + op_zero_point, +): + input, input_scale, _, _ = symbolic_helper.dequantize_helper(g, q_input) + weight, weight_scale, _, _ = symbolic_helper.dequantize_helper(g, q_weight) + q_bias = symbolic_helper.requantize_bias_helper(g, bias, input_scale, weight_scale) + bias, _, _, _ = symbolic_helper.dequantize_helper(g, q_bias) + + output = opset9.conv2d(g, input, weight, bias, stride, padding, dilation, groups) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::conv3d") +def quantized_conv3d( + g: jit_utils.GraphContext, + q_input, + q_weight, + bias, + stride, + padding, + dilation, + groups, + op_scale, + op_zero_point, +): + input, input_scale, _, _ = symbolic_helper.dequantize_helper(g, q_input) + weight, weight_scale, _, _ = symbolic_helper.dequantize_helper(g, q_weight) + q_bias = symbolic_helper.requantize_bias_helper(g, bias, input_scale, weight_scale) + bias, _, _, _ = symbolic_helper.dequantize_helper(g, q_bias) + + output = opset9.conv3d(g, input, weight, bias, stride, padding, dilation, groups) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::conv_transpose1d") +def quantized_conv_transpose1d( + g: jit_utils.GraphContext, + q_input, + q_weight, + bias, + stride, + padding, + output_padding, + dilation, + groups, + op_scale, + op_zero_point, +): + input, input_scale, _, _ = symbolic_helper.dequantize_helper(g, q_input) + weight, weight_scale, _, _ = symbolic_helper.dequantize_helper(g, q_weight) + q_bias = symbolic_helper.requantize_bias_helper(g, bias, input_scale, weight_scale) + bias, _, _, _ = symbolic_helper.dequantize_helper(g, q_bias) + + output = opset9.conv_transpose2d( + g, input, weight, bias, stride, padding, output_padding, groups, dilation + ) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::conv_transpose2d") +def quantized_conv_transpose2d( + g: jit_utils.GraphContext, + q_input, + q_weight, + bias, + stride, + padding, + output_padding, + dilation, + groups, + op_scale, + op_zero_point, +): + input, input_scale, _, _ = symbolic_helper.dequantize_helper(g, q_input) + weight, weight_scale, _, _ = symbolic_helper.dequantize_helper(g, q_weight) + q_bias = symbolic_helper.requantize_bias_helper(g, bias, input_scale, weight_scale) + bias, _, _, _ = symbolic_helper.dequantize_helper(g, q_bias) + + output = opset9.conv_transpose2d( + g, input, weight, bias, stride, padding, output_padding, groups, dilation + ) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::conv_transpose3d") +def quantized_conv_transpose3d( + g: jit_utils.GraphContext, + q_input, + q_weight, + bias, + stride, + padding, + output_padding, + dilation, + groups, + op_scale, + op_zero_point, +): + input, input_scale, _, _ = symbolic_helper.dequantize_helper(g, q_input) + weight, weight_scale, _, _ = symbolic_helper.dequantize_helper(g, q_weight) + q_bias = symbolic_helper.requantize_bias_helper(g, bias, input_scale, weight_scale) + bias, _, _, _ = symbolic_helper.dequantize_helper(g, 
q_bias) + + output = opset9.conv_transpose3d( + g, input, weight, bias, stride, padding, output_padding, groups, dilation + ) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::cat") +@symbolic_helper.parse_args("v", "i", "v", "v") +def quantized_cat( + g: jit_utils.GraphContext, + q_inputs: _C.Value, + dim: int, + op_scale: _C.Value, + op_zero_point: _C.Value, +) -> _C.Value: + unpacked_inputs = symbolic_helper._unpack_list(q_inputs) + dequantized = [ + symbolic_helper.dequantize_helper(g, input)[0] for input in unpacked_inputs + ] + concatenated = g.op("Concat", *dequantized, axis_i=dim) + return symbolic_helper.quantize_helper(g, concatenated, op_scale, op_zero_point) diff --git a/.venv/Lib/site-packages/torch/onnx/symbolic_opset11.py b/.venv/Lib/site-packages/torch/onnx/symbolic_opset11.py new file mode 100644 index 0000000000000000000000000000000000000000..a879d30cf210a39530d3ec4d7718b60001cee91e --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/symbolic_opset11.py @@ -0,0 +1,1467 @@ +# mypy: allow-untyped-defs +# mypy: disable-error-code=arg-type +"""This file exports ONNX ops for opset 11.""" + +from __future__ import annotations + +import functools +import sys +import warnings +from typing import Sequence + +import torch +from torch import _C +from torch._C import _onnx as _C_onnx +from torch.onnx import ( + _type_utils, + errors, + symbolic_helper, + symbolic_opset10 as opset10, + symbolic_opset9 as opset9, + utils, +) +from torch.onnx._internal import jit_utils, registration + + +# EDITING THIS FILE? READ THIS FIRST! +# see Note [Edit Symbolic Files] in README.md + +__all__ = [ + "add", + "append", + "arange", + "argsort", + "atleast_1d", + "atleast_2d", + "atleast_3d", + "cat", + "chunk", + "clamp_max", + "clamp_min", + "clamp", + "constant_pad_nd", + "cumsum", + "Delete", + "embedding_bag", + "embedding_renorm", + "flatten", + "gather", + "hardtanh", + "hstack", + "im2col", + "index_fill", + "index", + "index_copy", + "index_put", + "insert", + "linalg_det", + "linalg_vector_norm", + "logdet", + "masked_scatter", + "masked_select", + "mm", + "narrow", + "normal", + "pad", + "pixel_shuffle", + "pop", + "prim_constant_chunk", + "reflection_pad", + "relu6", + "remainder", + "replication_pad", + "round", + "scatter", + "select", + "size", + "sort", + "split_with_sizes", + "split", + "squeeze", + "stack", + "topk", + "unbind", + "unique_dim", + "unsqueeze", + "vstack", +] + +_onnx_symbolic = functools.partial(registration.onnx_symbolic, opset=11) + + +@_onnx_symbolic("aten::hardtanh") +@symbolic_helper.quantized_args(True) +@symbolic_helper.parse_args("v", "f", "f") +def hardtanh(g: jit_utils.GraphContext, self: _C.Value, min_val: float, max_val: float): + scalar_type = _type_utils.JitScalarType.from_value( + self, _type_utils.JitScalarType.FLOAT + ) + min_val = g.op( + "Constant", + value_t=torch.tensor(min_val, dtype=scalar_type.dtype()), + ) + max_val = g.op( + "Constant", + value_t=torch.tensor(max_val, dtype=scalar_type.dtype()), + ) + return symbolic_helper._op_with_optional_float_cast( + g, "Clip", self, min_val, max_val, opset_before=12 + ) + + +@_onnx_symbolic("aten::clamp") +def clamp(g: jit_utils.GraphContext, self, min, max): + def _cast_if_not_none(tensor, dtype): + if tensor is not None and not symbolic_helper._is_none(tensor): + return g.op( + "Cast", + tensor, + to_i=dtype.onnx_type(), + ) + else: + return tensor + + scalar_type = _type_utils.JitScalarType.from_value( + self, 
_type_utils.JitScalarType.UNDEFINED + ) + if scalar_type != _type_utils.JitScalarType.UNDEFINED: + min = _cast_if_not_none(min, scalar_type) + max = _cast_if_not_none(max, scalar_type) + + if symbolic_helper._is_none(min): + return clamp_max(g, self, max) + elif symbolic_helper._is_none(max): + return clamp_min(g, self, min) + else: + if ( + symbolic_helper._get_tensor_rank(min) == 0 + and symbolic_helper._get_tensor_rank(max) == 0 + ): + return symbolic_helper._op_with_optional_float_cast( + g, "Clip", self, min, max, opset_before=12 + ) + else: + return clamp_max(g, clamp_min(g, self, min), max) + + +@_onnx_symbolic("aten::clamp_min") +@symbolic_helper.parse_args("v", "v") +def clamp_min(g: jit_utils.GraphContext, self, min): + min = g.op("Cast", min, to_i=_type_utils.JitScalarType.from_value(self).onnx_type()) + if symbolic_helper._get_tensor_rank(min) == 0: + max = opset9.unused(g) + return symbolic_helper._op_with_optional_float_cast( + g, "Clip", self, min, max, opset_before=12 + ) + else: + return symbolic_helper._op_with_optional_float_cast( + g, "Max", self, min, opset_before=12 + ) + + +@_onnx_symbolic("aten::clamp_max") +@symbolic_helper.parse_args("v", "v") +def clamp_max(g: jit_utils.GraphContext, self, max): + max = g.op("Cast", max, to_i=_type_utils.JitScalarType.from_value(self).onnx_type()) + if symbolic_helper._get_tensor_rank(max) == 0: + min = opset9.unused(g) + return symbolic_helper._op_with_optional_float_cast( + g, "Clip", self, min, max, opset_before=12 + ) + else: + return symbolic_helper._op_with_optional_float_cast( + g, "Min", self, max, opset_before=12 + ) + + +@_onnx_symbolic("aten::relu6") +def relu6(g: jit_utils.GraphContext, input): + scalar_type = _type_utils.JitScalarType.from_value( + input, _type_utils.JitScalarType.FLOAT + ) + min_val = g.op( + "Constant", + value_t=torch.tensor(0, dtype=scalar_type.dtype()), + ) + max_val = g.op( + "Constant", + value_t=torch.tensor(6, dtype=scalar_type.dtype()), + ) + return clamp(g, input, min_val, max_val) + + +@_onnx_symbolic("aten::select") +# Opset 11 gather accepts negative indices +@symbolic_helper.quantized_args(True) +@symbolic_helper.parse_args("v", "i", "v") +def select(g: jit_utils.GraphContext, self, dim, index): + return g.op("Gather", self, index, axis_i=dim) + + +@_onnx_symbolic("aten::index_put") +def index_put( + g: jit_utils.GraphContext, self, indices_list_value, values, accumulate=False +): + if symbolic_helper._is_packed_list(indices_list_value): + indices_list = symbolic_helper._unpack_list(indices_list_value) + else: + indices_list = [indices_list_value] + accumulate = symbolic_helper._parse_arg(accumulate, "b") + + if len(indices_list) == 0: + return values + + if len(indices_list) > 1: + for idx_ in range(len(indices_list)): + if symbolic_helper._is_bool(indices_list[idx_]): + indices_list[idx_] = g.op("NonZero", indices_list[idx_]) + index = indices_list[0] + + for ind in indices_list[1:]: + index = opset9.add(g, index, ind) + broadcast_index_shape = g.op("Shape", index) + indices_list = [ + symbolic_helper._unsqueeze_helper( + g, opset9.expand(g, ind, broadcast_index_shape, None), [-1] + ) + for ind in indices_list + ] + index = g.op("Concat", *indices_list, axis_i=-1) + else: + # Replace index_put node with masked_scatter or masked_fill + # when inputs to the index_put node contains a single boolean input. + # + # index_put -> masked_fill + # * input index contains single tensor of Bool type (e.g.: %24 <- %23). + # * input value contains single element (e.g.: %18). 
+ # + # Torch IR + # %mask : Float(2, 2, 2, strides=[4, 2, 1], requires_grad=0, device=cpu) = aten::clone(%0, %6) + # %16 : Bool(2, 2, 2, strides=[4, 2, 1], requires_grad=0, device=cpu) = + # aten::to(%8, %26, %27, %11, %12, %28, %29, %15) + # %18 : Float(requires_grad=0, device=cpu) = prim::Constant[value={1}]() + # %23 : Bool(8, strides=[1], device=cpu) = aten::view(%16, %22) + # %24 : Tensor?[] = prim::ListConstruct(%23) + # %25 : Float(2, 2, 2, strides=[4, 2, 1], requires_grad=0, device=cpu) = + # aten::index_put(%mask, %24, %18, %30) + # return (%25) + # + # + # index_put -> masked_scatter + # * input index contains single tensor of Bool type (e.g.: %32 <- %31). + # * input value contains multiple elements (e.g.: %28). + # + # Torch IR + # %mask : Float(2, 2, 2, strides=[4, 2, 1], requires_grad=0, device=cpu) = aten::clone(%0, %6) + # %28 : Float(8, strides=[1], requires_grad=0, device=cpu) + # = prim::Constant[value= 1 1 1 1 1 1 1 1 [ CPUFloatType{8} ]]() + # %15 : Bool(2, 2, 2, strides=[4, 2, 1], requires_grad=0, device=cpu) + # = aten::ne(%mask, %some_const) + # %23 : Bool(2, 2, 2, strides=[4, 2, 1], requires_grad=0, device=cpu) + # = aten::to(%15, %34, %35, %18, %19, %36, %37, %22) + # %38 : Long(requires_grad=0, device=cpu) = prim::Constant[value={0}]() + # %30 : int[] = prim::Constant[value=[-1]]() + # %31 : Bool(8, strides=[1], device=cpu) = aten::view(%23, %30) + # %32 : Tensor?[] = prim::ListConstruct(%31) + # %33 : Float(2, 2, 2, strides=[4, 2, 1], requires_grad=0, device=cpu) + # = aten::index_put(%mask, %32, %28, %38) + # return (%33) + index = indices_list[0] + bool_inp = index + if symbolic_helper._is_bool(bool_inp): + rank = symbolic_helper._get_tensor_rank(values) + if rank is not None and rank == 0: + return opset9.masked_fill(g, self, bool_inp, values) + mask_rank = symbolic_helper._get_tensor_rank(bool_inp) + self_rank = symbolic_helper._get_tensor_rank(self) + if ( + mask_rank is not None + and self_rank is not None + and self_rank > mask_rank + ): + # Unsqueeze 'bool_inp' to be broadcastable to shape of 'self'. 
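+ # e.g. with self_rank=4 and mask_rank=2, bool_inp is unsqueezed at
+ # dims [2, 3] so the mask broadcasts over the trailing dimensions.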
+ bool_inp = symbolic_helper._unsqueeze_helper( + g, bool_inp, list(range(mask_rank, self_rank)) + ) + return masked_scatter(g, self, bool_inp, values) + broadcast_index_shape = g.op("Shape", index) + index = symbolic_helper._unsqueeze_helper(g, index, [-1]) + sub_data_shape = symbolic_helper._slice_helper( + g, g.op("Shape", self), axes=[0], starts=[len(indices_list)], ends=[sys.maxsize] + ) + values_shape = g.op("Concat", broadcast_index_shape, sub_data_shape, axis_i=0) + # Check if values is a singular value and expand accordingly + rank = symbolic_helper._get_tensor_rank(values) + if rank is not None and rank == 0: + values = opset9.expand(g, values, values_shape, None) + values = symbolic_helper._reshape_helper(g, values, values_shape) + + self_scalar_type = _type_utils.JitScalarType.from_value( + self, _type_utils.JitScalarType.UNDEFINED + ) + if self_scalar_type != _type_utils.JitScalarType.UNDEFINED: + values_scalar_type = _type_utils.JitScalarType.from_value( + values, _type_utils.JitScalarType.UNDEFINED + ) + if self_scalar_type != values_scalar_type: + values = g.op("Cast", values, to_i=self_scalar_type.onnx_type()) + elif accumulate: + raise errors.SymbolicValueError("self does not have a valid scalar type.", self) + + if accumulate: + zeros = g.op( + "ConstantOfShape", + g.op("Shape", self), + value_t=torch.tensor([0], dtype=self_scalar_type.dtype()), + ) + result = g.op("ScatterND", zeros, index, values) + result = add(g, self, result) + else: + result = g.op("ScatterND", self, index, values) + + return result + + +@_onnx_symbolic("aten::pixel_shuffle") +@symbolic_helper.parse_args("v", "i") +def pixel_shuffle(g: jit_utils.GraphContext, self, upscale_factor): + rank = symbolic_helper._get_tensor_rank(self) + if rank is not None and rank != 4: + return symbolic_helper._unimplemented("pixel_shuffle", "only support 4d input") + return g.op("DepthToSpace", self, blocksize_i=upscale_factor, mode_s="CRD") + + +@_onnx_symbolic( + "aten::upsample_nearest1d", + decorate=[symbolic_helper._apply_params("upsample_nearest1d", 3, "nearest")], +) +@_onnx_symbolic( + "aten::upsample_nearest2d", + decorate=[symbolic_helper._apply_params("upsample_nearest2d", 4, "nearest")], +) +@_onnx_symbolic( + "aten::upsample_nearest3d", + decorate=[symbolic_helper._apply_params("upsample_nearest3d", 5, "nearest")], +) +@_onnx_symbolic( + "aten::upsample_linear1d", + decorate=[symbolic_helper._apply_params("upsample_linear1d", 3, "linear")], +) +@_onnx_symbolic( + "aten::upsample_bilinear2d", + decorate=[symbolic_helper._apply_params("upsample_bilinear2d", 4, "linear")], +) +@_onnx_symbolic( + "aten::upsample_trilinear3d", + decorate=[symbolic_helper._apply_params("upsample_trilinear3d", 5, "linear")], +) +@_onnx_symbolic( + "aten::upsample_bicubic2d", + decorate=[symbolic_helper._apply_params("upsample_bicubic2d", 4, "cubic")], +) +def _interpolate(name: str, dim: int, interpolate_mode: str): + return symbolic_helper._interpolate_helper(name, dim, interpolate_mode) + + +@_onnx_symbolic("aten::__interpolate") +@symbolic_helper.quantized_args(True, False, False, False, False, False, False) +def __interpolate( + g: jit_utils.GraphContext, + input, + size, + scale_factor, + mode, + align_corners, + recompute_scale_factor, + antialias, +): + return symbolic_helper.__interpolate_helper( + g, input, size, scale_factor, mode, align_corners, recompute_scale_factor + ) + + +@_onnx_symbolic("aten::gather") +@symbolic_helper.parse_args("v", "i", "v", "v") +def gather(g: jit_utils.GraphContext, self, dim, index, 
sparse_grad=False): + if symbolic_helper._maybe_get_const(sparse_grad, "i"): + return symbolic_helper._unimplemented("gather", "sparse_grad == True") + return g.op("GatherElements", self, index, axis_i=dim) + + +@_onnx_symbolic("aten::scatter") +@symbolic_helper.parse_args("v", "i", "v", "v") +def scatter(g: jit_utils.GraphContext, self, dim, index, src): + src_type = _type_utils.JitScalarType.from_value(src) + src = symbolic_helper._maybe_get_scalar(src) + if symbolic_helper._is_value(src): + return g.op("ScatterElements", self, index, src, axis_i=dim) + else: + # Check if scalar "src" has same type as self (PyTorch allows different + # type for scalar src (but not when src is tensor)). If not, insert Cast node. + if _type_utils.JitScalarType.from_value(self) != src_type: + src = g.op( + "Cast", + src, + to_i=_type_utils.JitScalarType.from_value(self).onnx_type(), + ) + return g.op( + "ScatterElements", self, index, opset9.expand_as(g, src, index), axis_i=dim + ) + + +@_onnx_symbolic("aten::cumsum") +@symbolic_helper.parse_args("v", "i", "none") +def cumsum(g: jit_utils.GraphContext, self, dim, dtype=None): + dim_tensor = g.op("Constant", value_t=torch.tensor(dim, dtype=torch.int)) + if dtype and dtype.node().kind() != "prim::Constant": + parsed_dtype = symbolic_helper._get_const(dtype, "i", "dtype") + cast = g.op( + "Cast", self, to_i=_type_utils.JitScalarType(parsed_dtype).onnx_type() + ) + else: + cast = self + csum = g.op("CumSum", cast, dim_tensor) + return csum + + +@_onnx_symbolic("aten::masked_select") +def masked_select(g: jit_utils.GraphContext, self, mask): + index = opset9.nonzero(g, opset9.expand_as(g, mask, self)) + return g.op("GatherND", self, index) + + +@_onnx_symbolic("aten::masked_scatter") +def masked_scatter(g: jit_utils.GraphContext, self, mask, source): + index = opset9.nonzero(g, opset9.expand_as(g, mask, self)) + # NOTE: source can have more elements than needed. + # It could also have arbitrary shape. + # This is not supported by ONNX::ScatterND, so we need to flatten and slice source tensor. 
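+ # e.g. for x.masked_scatter_(mask, source), only the first mask.sum()
+ # elements of the flattened source are consumed, in order.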
+ source = symbolic_helper._reshape_helper(g, source, torch.LongTensor([-1])) + source = symbolic_helper._slice_helper( + g, + source, + axes=torch.LongTensor([0]), + starts=torch.LongTensor([0]), + ends=opset9.size(g, index, torch.LongTensor([0])), + ) + return g.op("ScatterND", self, index, source) + + +@_onnx_symbolic("aten::len") +def _len(g: jit_utils.GraphContext, self): + if ( + symbolic_helper._is_tensor_list(self) + or self.node().kind() == "onnx::SplitToSequence" + ): + return g.op("SequenceLength", self) + sz_0 = size(g, self, g.op("Constant", value_t=torch.LongTensor([0]))) + return symbolic_helper._squeeze_helper(g, sz_0, [0]) + + +@_onnx_symbolic("aten::__getitem_") +def __getitem_(g: jit_utils.GraphContext, self, i): + if symbolic_helper._is_tensor_list(self): + # SequenceAt requires that the input be a List of Tensors + return g.op("SequenceAt", self, i) + else: + from torch.onnx.symbolic_opset9 import __getitem_ as getitem + + return getitem(g, self, i) + + +@_onnx_symbolic("aten::_set_item") +def _set_item(g: jit_utils.GraphContext, tensor_list, i, v): + tensor_list = g.op("SequenceErase", tensor_list, i) + return g.op("SequenceInsert", tensor_list, v, i) + + +@_onnx_symbolic("aten::append") +def append(g: jit_utils.GraphContext, self, tensor): + return g.op("SequenceInsert", self, tensor) + + +@_onnx_symbolic("aten::add") +def add(g: jit_utils.GraphContext, self, other, alpha=None): + if symbolic_helper._is_value(self) and symbolic_helper._is_tensor_list(self): + tensor_list_node = other.node() + if tensor_list_node.kind() != "prim::ListConstruct": + return symbolic_helper._unimplemented( + "add", "does not support adding dynamic tensor list to another" + ) + tensors = symbolic_helper._unpack_list(other) + l = self + for t in tensors: + l = g.op("SequenceInsert", l, t) + return l + + return opset9.add(g, self, other, alpha) + + +@_onnx_symbolic("aten::insert") +def insert(g: jit_utils.GraphContext, self, pos, tensor): + return g.op("SequenceInsert", self, tensor, pos) + + +@_onnx_symbolic("aten::pop") +def pop(g: jit_utils.GraphContext, tensor_list, dim): + return g.op("SequenceErase", tensor_list, dim) + + +@_onnx_symbolic("aten::Delete") +def Delete(g: jit_utils.GraphContext, tensor_list, dim): + return g.op("SequenceErase", tensor_list, dim) + + +@_onnx_symbolic("aten::cat") +@symbolic_helper.quantized_args(True) +def cat(g: jit_utils.GraphContext, tensor_list, dim): + if symbolic_helper._is_packed_list(tensor_list): + return opset9.cat(g, tensor_list, dim) + else: + dim = symbolic_helper._get_const(dim, "i", "dim") + return g.op("ConcatFromSequence", tensor_list, axis_i=dim) + + +@_onnx_symbolic("aten::stack") +def stack(g: jit_utils.GraphContext, tensor_list, dim): + if symbolic_helper._is_packed_list(tensor_list): + return opset9.stack(g, tensor_list, dim) + else: + dim = symbolic_helper._get_const(dim, "i", "dim") + return g.op("ConcatFromSequence", tensor_list, axis_i=dim, new_axis_i=1) + + +@_onnx_symbolic("aten::_unique2") +@symbolic_helper.parse_args("v", "i", "i", "i") +def _unique2(g: jit_utils.GraphContext, self, sorted, return_inverse, return_counts): + u, indices, inverse_indices, counts = g.op( + "Unique", self, sorted_i=sorted, outputs=4 + ) + return u, inverse_indices, counts + + +@_onnx_symbolic("aten::unique_dim") +@symbolic_helper.parse_args("v", "i", "i", "i", "i") +def unique_dim( + g: jit_utils.GraphContext, self, dim, sorted, return_inverse, return_counts +): + u, indices, inverse_indices, counts = g.op( + "Unique", self, axis_i=dim, 
sorted_i=sorted, outputs=4 + ) + return u, inverse_indices, counts + + +@_onnx_symbolic("aten::topk") +@symbolic_helper.parse_args("v", "v", "i", "i", "i", "none") +def topk(g: jit_utils.GraphContext, self, k, dim, largest, sorted, out=None): + return symbolic_helper._topk_helper( + g, self, k, dim, largest=largest, sorted=sorted, out=out + ) + + +@_onnx_symbolic("aten::sort") +@symbolic_helper.parse_args("v", "i", "i", "none") +def sort(g: jit_utils.GraphContext, self, dim, decending, out=None): + return symbolic_helper._sort_helper(g, self, dim, decending=decending, out=out) + + +@_onnx_symbolic("aten::argsort") +@symbolic_helper.parse_args("v", "i", "i", "none") +def argsort(g: jit_utils.GraphContext, self, dim, decending, out=None): + _, indices = symbolic_helper._sort_helper( + g, self, dim, decending=decending, out=out + ) + return indices + + +@_onnx_symbolic("aten::round") +@symbolic_helper.parse_args("v", "i") +def round(g: jit_utils.GraphContext, self, decimals=0): + if not symbolic_helper._is_fp(self): + return self + if decimals == 0: + return g.op("Round", self) + mul = g.op("Mul", self, g.op("Constant", value_t=torch.tensor(pow(10, decimals)))) + round = g.op("Round", mul) + return g.op( + "Mul", round, g.op("Constant", value_t=torch.tensor(pow(10, -1 * decimals))) + ) + + +@_onnx_symbolic("aten::remainder") +def remainder(g: jit_utils.GraphContext, input, other): + if symbolic_helper._is_fp(input) or symbolic_helper._is_fp(other): + return opset9.remainder(g, input, other) + return g.op("Mod", input, other, fmod_i=0) + + +@_onnx_symbolic("aten::split") +@symbolic_helper.parse_args("v", "v", "i", "i") +def split(g: jit_utils.GraphContext, self, split_size_or_sizes, dim, _outputs=None): + if not symbolic_helper._is_split_static(split_size_or_sizes, _outputs): + split_out = g.op("SplitToSequence", self, split_size_or_sizes, axis_i=dim) + if _outputs is None: + return split_out + # Convert to multiple slice nodes iff number of splits and number of outputs are statically known. 
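+ # e.g. aten::split(x, [2, 3, 4], dim=0) with _outputs=3 lowers to three
+ # Slice ops covering [0:2], [2:5] and [5:9] along dim 0.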
+ if ( + symbolic_helper._is_packed_list(split_size_or_sizes) + and len(symbolic_helper._unpack_list(split_size_or_sizes)) == _outputs + ): + split_sizes = [ + symbolic_helper._unsqueeze_helper(g, v, [0]) + for v in symbolic_helper._unpack_list(split_size_or_sizes) + ] + start = g.op("Constant", value_t=torch.tensor([0], dtype=torch.long)) + axis = g.op("Constant", value_t=torch.tensor([dim], dtype=torch.long)) + res = [] + for i in range(_outputs): + end = g.op( + "Add", start, split_sizes[i] + ) # split_sizes is a list of same length as _outputs + res.append(g.op("Slice", self, start, end, axis)) + start = end + return res + return [ + g.op( + "SequenceAt", + split_out, + g.op("Constant", value_t=torch.tensor([i], dtype=torch.long)), + ) + for i in range(_outputs) + ] + else: + return opset9.split(g, self, split_size_or_sizes, dim, _outputs) + + +@_onnx_symbolic("aten::split_with_sizes") +@symbolic_helper.parse_args("v", "v", "i", "i") +def split_with_sizes(g: jit_utils.GraphContext, self, split_sizes, dim, _outputs=None): + return split(g, self, split_sizes, dim, _outputs) + + +@_onnx_symbolic("aten::unbind") +@symbolic_helper.parse_args("v", "i", "i") +def unbind(g: jit_utils.GraphContext, self, dim=0, _outputs=None): + if _outputs is None: + return g.op( + "SplitToSequence", + self, + g.op("Constant", value_t=torch.tensor(1, dtype=torch.long)), + axis_i=dim, + keepdims_i=0, + ) + else: + return opset9.unbind(g, self, dim, _outputs) + + +def _prepare_onnx_paddings(g: jit_utils.GraphContext, input, pad): + """Generate paddings in ONNX order based on pad in pytorch. + + Args: + input: the input tensor. + pad: the paddings in pytorch. + The order is dim_n_begin, dim_n_end, dim_n-1_begin, dim_n-1_end, ..., dim_m_begin, dim_m_end, + where m is in range [0, n]. + """ + if ( + not symbolic_helper._is_packed_list(pad) + and symbolic_helper._is_list(pad) + and symbolic_helper._is_scalar_list(pad) + ): + pad = g.op("ConcatFromSequence", pad, axis_i=0, new_axis_i=1) + # The desired order of paddings is + # dim_0_begin, dim_1_begin, ... , dim_0_end, ..., dim_n_end. + # n is the dimension of input. + # Assume zero-dimensions in the beginning, pad the "pad" sequence with zeros in the beginning + pad_len = opset9.size(g, pad, g.op("Constant", value_t=torch.tensor([0]))) + # Set extension = [0] * (dim * 2 - len(pad)) + rank = symbolic_helper._get_tensor_rank(input) + if rank is None: + rank = g.op("Size", g.op("Shape", input)) + else: + rank = g.op("Constant", value_t=torch.tensor(rank, dtype=torch.int64)) + extension = g.op( + "Sub", + g.op("Mul", rank, g.op("Constant", value_t=torch.tensor(2, dtype=torch.int64))), + pad_len, + ) + # Concat pad with extension: paddings = [dim_n_begin, dim_n_end, dim_n-1_begin, dim_n-1_end, 0, 0, ... 
] + # Currently ONNX only supports int64 type for Pad + pad = g.op("Cast", pad, to_i=_C_onnx.TensorProtoDataType.INT64) + paddings = g.op( + "Concat", + pad, + g.op( + "ConstantOfShape", extension, value_t=torch.tensor([0], dtype=torch.int64) + ), + axis_i=0, + ) + # Reshape and reverse order and collate first beginnings and then ends + # paddings = [[..., 0, dim_n-1_begin, dim_n_begin], + # [..., 0, dim_n-1_end, dim_n_end]] + # Reshape back to 1-D paddings = [..., 0, dim_n - 1_begin, dim_n_begin, ..., 0, dim_n - 1_end, dim_n_end] + paddings = symbolic_helper._reshape_helper( + g, paddings, g.op("Constant", value_t=torch.tensor([-1, 2])) + ) + paddings = g.op("Transpose", opset10.flip(g, paddings, [0]), perm_i=[1, 0]) + paddings = symbolic_helper._reshape_helper( + g, paddings, g.op("Constant", value_t=torch.tensor([-1])) + ) + padding_c = g.op("Cast", paddings, to_i=_C_onnx.TensorProtoDataType.INT64) + return padding_c + + +@_onnx_symbolic("aten::constant_pad_nd") +def constant_pad_nd(g: jit_utils.GraphContext, input, padding, value=None): + mode = "constant" + value = symbolic_helper._maybe_get_scalar(value) + value = symbolic_helper._if_scalar_type_as(value, input) + pad = _prepare_onnx_paddings(g, input, padding) + return g.op("Pad", input, pad, value, mode_s=mode) + + +@_onnx_symbolic("aten::reflection_pad1d") +@_onnx_symbolic("aten::reflection_pad2d") +@_onnx_symbolic("aten::reflection_pad3d") +def reflection_pad(g: jit_utils.GraphContext, input, padding): + mode = "reflect" + paddings = _prepare_onnx_paddings(g, input, padding) + return g.op("Pad", input, paddings, mode_s=mode) + + +@_onnx_symbolic("aten::replication_pad1d") +@_onnx_symbolic("aten::replication_pad2d") +@_onnx_symbolic("aten::replication_pad3d") +def replication_pad(g: jit_utils.GraphContext, input, padding): + mode = "edge" + paddings = _prepare_onnx_paddings(g, input, padding) + return g.op("Pad", input, paddings, mode_s=mode) + + +@_onnx_symbolic("aten::pad") +def pad( + g: jit_utils.GraphContext, + input: _C.Value, + pad: _C.Value, + mode: _C.Value, + value: _C.Value, +): + mode = symbolic_helper._parse_arg(mode, "s") + if mode == "replicate": + return replication_pad(g, input, pad) + elif mode == "reflect": + return reflection_pad(g, input, pad) + elif mode == "constant": + return constant_pad_nd(g, input, pad, value) + elif mode == "circular": + return opset9._pad_circular(g, input, pad) + else: + raise errors.SymbolicValueError(f"Unrecognized padding mode {mode}", input) + + +@_onnx_symbolic("aten::linalg_det") +def linalg_det(g: jit_utils.GraphContext, self): + return g.op("Det", self) + + +@_onnx_symbolic("aten::logdet") +def logdet(g: jit_utils.GraphContext, input): + return opset9.log(g, linalg_det(g, input)) + + +@_onnx_symbolic("aten::arange") +def arange(g: jit_utils.GraphContext, *args): + def _get_arange_dtype(dtype): + dtype = symbolic_helper._maybe_get_const(dtype, "i") + return dtype + + if len(args) == 2 and all(isinstance(val, int) for val in args): + # aten::arange(Scalar start, Scalar end) + dtype = torch.int64 + # Start index. + start = g.op( + "Constant", + value_t=torch.tensor(args[0], dtype=dtype), + ) + # End (exclusive) index. + end = g.op( + "Constant", + value_t=torch.tensor(args[1], dtype=dtype), + ) + # Step size from start to end indexes. 
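+ # e.g. aten::arange(2, 5) lowers to Range(start=2, limit=5, delta=1),
+ # producing [2, 3, 4].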
+ delta_default = g.op( + "Constant", + value_t=torch.tensor(1, dtype=dtype), + ) + return g.op("Range", start, end, delta_default) + elif len(args) == 2 or len(args) == 5: + if len(args) == 2: + # aten::arange(Scalar end, Tensor out) + dtype = None + else: + # aten::arange(Scalar end, ScalarType dtype, Layout, Device, bool pin_memory) + dtype = _get_arange_dtype(args[1]) + type_, end, start, step = symbolic_helper._arange_cast_helper( + g, end=args[0], dtype=dtype + ) + start_default = g.op( + "Constant", + value_t=torch.tensor(0, dtype=type_.dtype()), + ) + delta_default = g.op( + "Constant", + value_t=torch.tensor(1, dtype=type_.dtype()), + ) + return g.op("Range", start_default, end, delta_default) + elif len(args) == 4 or len(args) == 7: + if len(args) == 4: + # aten::arange(Scalar start, Scalar end, Scalar step, Tensor out) + dtype = None + else: + # aten::arange(Scalar start, Scalar end, Scalar step, ScalarType dtype, Layout, Device, bool pin_memory) + dtype = _get_arange_dtype(args[3]) + _, end, start, step = symbolic_helper._arange_cast_helper( + g, start=args[0], end=args[1], step=args[2], dtype=dtype + ) + return g.op("Range", start, end, step) + elif len(args) == 6: + # aten::arange(Scalar start, Scalar end, ScalarType dtype, Layout, Device, bool pin_memory) + dtype = _get_arange_dtype(args[2]) + type_, end, start, step = symbolic_helper._arange_cast_helper( + g, start=args[0], end=args[1], dtype=dtype + ) + delta_default = g.op( + "Constant", + value_t=torch.tensor(1, dtype=type_.dtype()), + ) + return g.op("Range", start, end, delta_default) + else: + return symbolic_helper._unimplemented( + "aten::arange", f"with {len(args)} arguments" + ) + + +@_onnx_symbolic("aten::_dim_arange") +@symbolic_helper.parse_args("v", "i") +def _dim_arange(g: jit_utils.GraphContext, like, dim): + like_shape = g.op("Shape", like) + stop = g.op( + "Gather", like_shape, g.op("Constant", value_t=torch.tensor(dim)), axis_i=0 + ) + return arange(g, stop, 4, None, None, None) + + +@_onnx_symbolic("aten::size") +@symbolic_helper.quantized_args(True, quantize_output=False) +def size(g: jit_utils.GraphContext, self, dim=None): + if dim is None: + return g.op("Shape", self) + return symbolic_helper._size_helper(g, self, dim) + + +@_onnx_symbolic("aten::squeeze") +def squeeze(g: jit_utils.GraphContext, self, dim=None): + if dim is None: + return g.op("Squeeze", self) + + # dim as a tensor + if not symbolic_helper._is_constant(dim): + return symbolic_helper._squeeze_helper(g, self, [dim]) + + dim = symbolic_helper._get_const(dim, "i", "dim") + + input_rank = symbolic_helper._get_tensor_rank(self) + adjusted_dim = dim + if input_rank is not None and dim < 0: + adjusted_dim += input_rank + dim_size = symbolic_helper._get_tensor_dim_size(self, adjusted_dim) + if (dim < 0 and input_rank is None) or dim_size is None: + # If onnx shape inference is not on, export always as dynamic. + # Because we cannot tell if observed static shape is also static at runtime. + # create "cond" node (condition is shape[i]==1) + dim_constant = g.op("Constant", value_t=torch.tensor([dim])) + size = symbolic_helper._size_helper(g, self, dim_constant) + const_one = g.op("Constant", value_t=torch.ones(1, dtype=torch.int64)) + cond = g.op("Equal", size, const_one) + # create the "If" node and add the "then" and "else" blocks to it. 
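+ # The exported graph then squeezes the dim only when its runtime size
+ # equals 1 and otherwise passes the input through unchanged (Identity).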
+ if_op, (if_context, else_context), _ = jit_utils.add_op_with_blocks( + g, "If", cond, n_blocks=2 + ) + squeeze_ = symbolic_helper._squeeze_helper(if_context, self, [dim]) + utils._add_output_to_block(if_context.block, squeeze_) + identity_ = else_context.op("Identity", self) + utils._add_output_to_block(else_context.block, identity_) + return if_op + + # For static input shape + dim = adjusted_dim + if dim_size > 1: + warnings.warn( + "This model contains a squeeze operation on dimension " + + str(dim) + + ". The size of " + + "this dimension in the given input is " + + str(dim_size) + + ". The model will " + + "be exported without the squeeze node. If the model is intended to be used with dynamic " + + "input shapes, please export with dynamic_axes argument." + ) + return self + return symbolic_helper._squeeze_helper(g, self, [dim]) + + +@_onnx_symbolic("aten::unsqueeze") +def unsqueeze(g: jit_utils.GraphContext, self, dim): + if symbolic_helper._is_constant(dim): + dim = symbolic_helper._get_const(dim, "i", "dim") + + return symbolic_helper._unsqueeze_helper(g, self, [dim]) + + +@_onnx_symbolic("aten::mm") +def mm(g: jit_utils.GraphContext, self, other): + return g.op("Gemm", self, other, beta_f=0.0, alpha_f=1.0) + + +@_onnx_symbolic("aten::index") +def index(g: jit_utils.GraphContext, self, index): + if symbolic_helper._is_packed_list(index): + indices = symbolic_helper._unpack_list(index) + else: + indices = [index] + + # Handle single mask index. + if len(indices) == 1: + index = indices[0] + if not symbolic_helper._is_none(index) and ( + symbolic_helper._is_bool(index) + or _type_utils.JitScalarType.from_value(index) + == _type_utils.JitScalarType.UINT8 + ): + index = opset9.nonzero(g, index) + return g.op("GatherND", self, index) + return opset9.index(g, self, index) + + +@_onnx_symbolic("aten::index_fill") +def index_fill(g: jit_utils.GraphContext, self, dim, index, value): + dim_value = symbolic_helper._parse_arg(dim, "i") + expanded_index_shape, expanded_index = symbolic_helper._index_fill_reshape_helper( + g, self, dim, index + ) + value = symbolic_helper._maybe_get_scalar(value) + value = symbolic_helper._if_scalar_type_as(value, self) + expanded_value = opset9.expand(g, value, expanded_index_shape, None) + return scatter(g, self, dim, expanded_index, expanded_value) + + +@_onnx_symbolic("aten::index_copy") +def index_copy(g: jit_utils.GraphContext, self, dim, index, source): + dim_value = symbolic_helper._parse_arg(dim, "i") + expanded_index_shape, expanded_index = symbolic_helper._index_fill_reshape_helper( + g, self, dim, index + ) + return scatter(g, self, dim, expanded_index, source) + + +@_onnx_symbolic("aten::bitwise_right_shift") +@_onnx_symbolic("aten::__rshift_") +def __rshift_(g: jit_utils.GraphContext, self, other): + # make sure to cast other to self's type + # (when self is long, make sure that other is not float) + if _type_utils.JitScalarType.from_value( + other, _type_utils.JitScalarType.UNDEFINED + ) != _type_utils.JitScalarType.from_value(self): + other = g.op( + "Cast", + other, + to_i=_type_utils.JitScalarType.from_value(self).onnx_type(), + ) + + if ( + _type_utils.JitScalarType.from_value(self, _type_utils.JitScalarType.UNDEFINED) + == _type_utils.JitScalarType.UINT8 + ): + return g.op("BitShift", self, other, direction_s="RIGHT") + + two = g.op("Constant", value_t=torch.tensor(2, dtype=torch.float32)) + # exponent (same type as self) has to be float or double in onnx::Pow + if not symbolic_helper._is_fp(self): + other = g.op("Cast", other, 
to_i=_C_onnx.TensorProtoDataType.FLOAT) + two_pow = g.op("Pow", two, other) + two_pow = g.op( + "Cast", + two_pow, + to_i=_type_utils.JitScalarType.from_value(self).onnx_type(), + ) + rshift = g.op("Div", self, two_pow) + return rshift + + +@_onnx_symbolic("aten::bitwise_left_shift") +@_onnx_symbolic("aten::__lshift_") +def __lshift_(g: jit_utils.GraphContext, self, other): + # make sure to cast other to self's type + # (when self is long, make sure that other is not float) + if _type_utils.JitScalarType.from_value( + other, _type_utils.JitScalarType.UNDEFINED + ) != _type_utils.JitScalarType.from_value(self): + other = g.op( + "Cast", + other, + to_i=_type_utils.JitScalarType.from_value(self).onnx_type(), + ) + + if ( + _type_utils.JitScalarType.from_value(self, _type_utils.JitScalarType.UNDEFINED) + == _type_utils.JitScalarType.UINT8 + ): + return g.op("BitShift", self, other, direction_s="LEFT") + + two = g.op("Constant", value_t=torch.tensor(2, dtype=torch.float32)) + # exponent (same type as self) has to be float or double in onnx::Pow + if not symbolic_helper._is_fp(self): + other = g.op("Cast", other, to_i=_C_onnx.TensorProtoDataType.FLOAT) + two_pow = g.op("Pow", two, other) + two_pow = g.op( + "Cast", + two_pow, + to_i=_type_utils.JitScalarType.from_value(self).onnx_type(), + ) + lshift = g.op("Mul", self, two_pow) + return lshift + + +def _get_im2col_indices_along_dim( + g: jit_utils.GraphContext, input_d, kernel_size_d, dilation_d, padding_d, stride_d +): + # Input is always 4-D (N, C, H, W) + # Calculate indices of sliding blocks along spatial dimension + # Slide kernel over input each dim d: + # each dimension d ranges from 0 to input[d]+2xpadding[d]-dilation[d]x(kernel_size[d]-1) + # with steps = stride + + blocks_d = g.op( + "Add", input_d, g.op("Constant", value_t=torch.tensor(padding_d * 2)) + ) + blocks_d = g.op( + "Sub", + blocks_d, + g.op("Constant", value_t=torch.tensor(dilation_d * (kernel_size_d - 1))), + ) + + # Stride kernel over input and find starting indices along dim d + blocks_d_indices = g.op( + "Range", + g.op("Constant", value_t=torch.tensor(0)), + blocks_d, + g.op("Constant", value_t=torch.tensor(stride_d)), + ) + + # Apply dilation on kernel and find its indices along dim d + kernel_grid = torch.arange(0, kernel_size_d * dilation_d, dilation_d) + kernel_grid = g.op("Constant", value_t=kernel_grid.unsqueeze(0)) + + # Broadcast and add kernel staring positions (indices) with + # kernel_grid along dim d, to get block indices along dim d + blocks_d_indices = symbolic_helper._unsqueeze_helper( + g, blocks_d_indices, [0] + ) # Reshape to [1, -1] + kernel_mask = symbolic_helper._reshape_helper( + g, kernel_grid, g.op("Constant", value_t=torch.tensor([-1, 1])) + ) + block_mask = g.op("Add", blocks_d_indices, kernel_mask) + + return block_mask + + +def _get_im2col_padded_input(g: jit_utils.GraphContext, input, padding_h, padding_w): + # Input is always 4-D tensor (N, C, H, W) + # Padding tensor has the following format: (padding_h, padding_w) + # Reshape the padding to follow ONNX format: (dim1_begin, dim2_begin,...,dim1_end, dim2_end,...) 
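+ # e.g. padding_h=1, padding_w=2 yields pad = [0, 0, 1, 2, 0, 0, 1, 2],
+ # i.e. the batch and channel dims receive no padding.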
+ pad = g.op("Constant", value_t=torch.LongTensor([0, 0, padding_h, padding_w] * 2)) + return g.op("Pad", input, pad) + + +def _get_im2col_output_shape(g: jit_utils.GraphContext, input, kernel_h, kernel_w): + batch_dim = size(g, input, g.op("Constant", value_t=torch.tensor(0))) + channel_dim = size(g, input, g.op("Constant", value_t=torch.tensor(1))) + channel_unfolded = g.op( + "Mul", channel_dim, g.op("Constant", value_t=torch.tensor(kernel_h * kernel_w)) + ) + + return g.op( + "Concat", + symbolic_helper._unsqueeze_helper(g, batch_dim, [0]), + symbolic_helper._unsqueeze_helper(g, channel_unfolded, [0]), + g.op("Constant", value_t=torch.tensor([-1])), + axis_i=0, + ) + + +@_onnx_symbolic("aten::im2col") +@symbolic_helper.parse_args("v", "is", "is", "is", "is") +def im2col(g: jit_utils.GraphContext, input, kernel_size, dilation, padding, stride): + # Input is always 4-D tensor (N, C, H, W) + # All other args are int[2] + + input_h = size(g, input, g.op("Constant", value_t=torch.tensor(2))) + input_w = size(g, input, g.op("Constant", value_t=torch.tensor(3))) + + stride_h, stride_w = stride[0], stride[1] + padding_h, padding_w = padding[0], padding[1] + dilation_h, dilation_w = dilation[0], dilation[1] + kernel_h, kernel_w = kernel_size[0], kernel_size[1] + + blocks_row_indices = _get_im2col_indices_along_dim( + g, input_h, kernel_h, dilation_h, padding_h, stride_h + ) + blocks_col_indices = _get_im2col_indices_along_dim( + g, input_w, kernel_w, dilation_w, padding_w, stride_w + ) + + output_shape = _get_im2col_output_shape(g, input, kernel_h, kernel_w) + padded_input = _get_im2col_padded_input(g, input, padding_h, padding_w) + + # For a 4D matrix of size (1, 1, 3, 3) as below with kernel_size=2, stride=1, and dilation=1 + # [[[[1., 2., 3.,], + # [4., 5., 6.,], + # [7., 8., 9.,]]]] + # First gather indices along rows (dim=2) with blocks_row_indices = [[0,1], [1,2]] to get: + # [[[[[1., 2., 3.], + # [4., 5., 6.]], + # [[4., 5., 6.], + # [7., 8., 9.]]]]] + # And then gather along cols (dim=4) with blocks_row_indices = [[0,1], [1,2]] to get: + # [[[[[[1., 2.], + # [4., 5.]], + # [[2., 3.], + # [5., 6]]], + # [[[4., 5.], + # [7., 8.]], + # [[5., 6.], + # [8., 9.]]]]]] + # Transpose dims 3 (depth) and 4 (rows), and then reshape to output shape (1, 1, 4, 4) to get: + # [[[1., 2., 4., 5.], + # [2., 3., 5., 6.], + # [4., 5., 7., 8.], + # [5., 6., 8., 9.]]] + output = g.op("Gather", padded_input, blocks_row_indices, axis_i=2) + output = g.op("Gather", output, blocks_col_indices, axis_i=4) + output = g.op("Transpose", output, perm_i=[0, 1, 2, 4, 3, 5]) + return symbolic_helper._reshape_helper(g, output, output_shape) + + +@_onnx_symbolic("aten::narrow") +def narrow(g: jit_utils.GraphContext, input, dim, start, length): + end = g.op("Add", start, length) + return symbolic_helper._slice_helper(g, input, axes=dim, starts=start, ends=end) + + +@_onnx_symbolic("aten::flatten") +@symbolic_helper.quantized_args(True, False, False) +@symbolic_helper.parse_args("v", "i", "i") +def flatten(g: jit_utils.GraphContext, input, start_dim, end_dim): + dim = symbolic_helper._get_tensor_rank(input) + if dim == 1: + return input + # use ONNX's Flatten operator for cases where the output shape is 2D + if start_dim == 1: + if end_dim == -1 or (dim is not None and end_dim == dim - 1): + return g.op("Flatten", input, axis_i=start_dim) + elif start_dim == 0: + if end_dim == -2 or (dim is not None and end_dim == dim - 2): + return g.op("Flatten", input, axis_i=end_dim + 1) + if dim is None: + return 
symbolic_helper._unimplemented( + "dim", + "ONNX and PyTorch use different strategies to split the input. " + "Input rank must be known at export time.", + ) + # if end_dim is negative add dim + if end_dim < 0: + end_dim = dim + end_dim + + return symbolic_helper._flatten_helper(g, input, start_dim, end_dim, dim) + + +@_onnx_symbolic("aten::linalg_vector_norm") +@symbolic_helper.parse_args("v", "f", "is", "b", "v") +def linalg_vector_norm( + g: jit_utils.GraphContext, + self, + ord, + dim: Sequence[int] | None, + keepdim: bool, + dtype, +): + return symbolic_helper._linalg_vector_norm_helper(g, self, ord, dim, keepdim, dtype) + + +@_onnx_symbolic("aten::embedding_bag") +@symbolic_helper.parse_args("v", "v", "v", "i", "i", "i", "v", "i", "i") +def embedding_bag( + g: jit_utils.GraphContext, + embedding_matrix, + indices, + offsets, + scale_grad_by_freq, + mode, + sparse, + per_sample_weights, + include_last_offset, + padding_idx, +): + return symbolic_helper._embedding_bag_helper( + g, + embedding_matrix, + indices, + offsets, + scale_grad_by_freq, + mode, + sparse, + per_sample_weights, + include_last_offset, + padding_idx, + ) + + +@_onnx_symbolic("aten::embedding_renorm") +@symbolic_helper.parse_args("v", "v", "f", "f") +def embedding_renorm(g: jit_utils.GraphContext, weight, indices, max_norm, norm_type): + unique_indices = g.op("Unique", indices) + partial_weight = g.op("Gather", weight, unique_indices) + norm_i = int(norm_type) + if norm_i == 1: + norm_type = "ReduceL1" + elif norm_i == 2: + norm_type = "ReduceL2" + else: + raise errors.SymbolicValueError( + f"Unsupported: ONNX export of embedding_renorm with norm: {norm_i}. " + "Only 1. and 2. are supported.", + weight, + ) + partial_weight_norm = g.op(norm_type, partial_weight, axes_i=[1], keepdims_i=1) + # https://github.com/pytorch/pytorch/blob/0a07488ed2c47765e337e290bd138c0e6e459cbd/aten/src/ATen/native/Embedding.cpp#L177 + # Add 1e-7 to prevent division by zero. + partial_weight_norm_ = g.op( + "Add", partial_weight_norm, g.op("Constant", value_t=torch.tensor(1e-7)) + ) + max_norm = torch.tensor(max_norm) + scales = g.op("Div", max_norm, partial_weight_norm_) + partial_weight_renorm = g.op("Mul", partial_weight, scales) + partial_weight_renorm = g.op( + "Where", + g.op("Greater", partial_weight_norm, max_norm), + partial_weight_renorm, + partial_weight, + ) + return g.op( + "ScatterND", + weight, + symbolic_helper._unsqueeze_helper(g, unique_indices, [1]), + partial_weight_renorm, + ) + + +@_onnx_symbolic("aten::chunk") +def chunk(g: jit_utils.GraphContext, self, chunks, dim): + # Calculate chunk size for dynamic chunk + dim_size = g.op("Gather", g.op("Shape", self), dim, axis_i=0) + chunk_size_s = g.op( + "Sub", chunks, g.op("Constant", value_t=torch.tensor([1], dtype=torch.long)) + ) + chunk_size = g.op("Div", g.op("Add", dim_size, chunk_size_s), chunks) + # Create splits vector + chunk_vec = [ + opset9.expand(g, chunk_size, chunk_size_s, None), + g.op("Sub", dim_size, g.op("Mul", chunk_size, chunk_size_s)), + ] + chunk_vec = g.op("Concat", *chunk_vec, axis_i=0) + return split(g, self, chunk_vec, dim) + + +@_onnx_symbolic("aten::normal") +def normal( + g: jit_utils.GraphContext, + mean, + std, + sizes=None, + generator=None, + dtype=None, + layout=None, + device=None, + pin_memory=None, +): + # If you can sample from a given distribution with mean 0 and variance 1, then you can easily sample from a + # scale-location transformation of that distribution, which has mean mu and variance sigma's square. 
If x is a sample + # from a mean 0 and variance 1 distribution then + # sigma x+mu + # is a sample with mean mu and variance sigma's square. + if sizes is not None and not symbolic_helper._is_none(sizes): + mean = opset9.expand(g, mean, sizes, None) + result = opset9.mul(g, std, g.op("RandomNormalLike", mean)) + return add(g, result, mean) + + +@_onnx_symbolic("aten::atleast_1d") +def atleast_1d(g: jit_utils.GraphContext, self: torch._C.Value): + # NOTE: If it's 0D, reshape to 1D + + # NOTE: self could be a packed list or a tensor + if symbolic_helper._is_value(self) and symbolic_helper._is_packed_list(self): + tensor_list = symbolic_helper._unpack_list(self) + new_tensor_list = [] + for tensor in tensor_list: + new_tensor = tensor + tensor_rank = symbolic_helper._get_tensor_rank(tensor) + if tensor_rank == 0: + new_tensor = symbolic_helper._reshape_helper( + g, new_tensor, g.op("Constant", value_t=torch.tensor([1])) + ) + new_tensor_list.append(new_tensor) + return g.op("SequenceConstruct", *new_tensor_list) + + tensor_rank = symbolic_helper._get_tensor_rank(self) + if tensor_rank == 0: + self = symbolic_helper._reshape_helper( + g, self, g.op("Constant", value_t=torch.tensor([1])) + ) + return self + + +@_onnx_symbolic("aten::atleast_2d") +def atleast_2d(g: jit_utils.GraphContext, self: torch._C.Value): + # NOTE: If it's 0D, reshape to 2D + # If it's 1D, unsqueeze to 2D + + # NOTE: self could be a packed list or a tensor + if symbolic_helper._is_value(self) and symbolic_helper._is_packed_list(self): + tensor_list = symbolic_helper._unpack_list(self) + new_tensor_list = [] + for tensor in tensor_list: + new_tensor = tensor + tensor_rank = symbolic_helper._get_tensor_rank(tensor) + if tensor_rank == 0: + new_tensor = symbolic_helper._reshape_helper( + g, new_tensor, g.op("Constant", value_t=torch.tensor([1, 1])) + ) + elif tensor_rank == 1: + new_tensor = symbolic_helper._unsqueeze_helper( + g, new_tensor, axes_i=[0] + ) + new_tensor_list.append(new_tensor) + return g.op("SequenceConstruct", *new_tensor_list) + + tensor_rank = symbolic_helper._get_tensor_rank(self) + if tensor_rank == 0: + self = symbolic_helper._reshape_helper( + g, self, g.op("Constant", value_t=torch.tensor([1, 1])) + ) + elif tensor_rank == 1: + self = symbolic_helper._unsqueeze_helper(g, self, axes_i=[0]) + return self + + +@_onnx_symbolic("aten::atleast_3d") +def atleast_3d(g: jit_utils.GraphContext, self: torch._C.Value): + # NOTE: If it's 0D, reshape to 3D + # If it's 1D, unsqueeze to 3D + # If it's 2D, unsqueeze to 3D + + # NOTE: self could be a packed list or a tensor + if symbolic_helper._is_value(self) and symbolic_helper._is_packed_list(self): + tensor_list = symbolic_helper._unpack_list(self) + new_tensor_list = [] + for tensor in tensor_list: + new_tensor = tensor + tensor_rank = symbolic_helper._get_tensor_rank(tensor) + if tensor_rank == 0: + new_tensor = symbolic_helper._reshape_helper( + g, new_tensor, g.op("Constant", value_t=torch.tensor([1, 1, 1])) + ) + elif tensor_rank == 1: + new_tensor = symbolic_helper._unsqueeze_helper( + g, new_tensor, axes_i=[0] + ) + new_tensor = symbolic_helper._unsqueeze_helper( + g, new_tensor, axes_i=[-1] + ) + elif tensor_rank == 2: + new_tensor = symbolic_helper._unsqueeze_helper( + g, new_tensor, axes_i=[-1] + ) + new_tensor_list.append(new_tensor) + return g.op("SequenceConstruct", *new_tensor_list) + + tensor_rank = symbolic_helper._get_tensor_rank(self) + if tensor_rank == 0: + self = symbolic_helper._reshape_helper( + g, self, g.op("Constant", 
value_t=torch.tensor([1, 1, 1])) + ) + elif tensor_rank == 1: + self = symbolic_helper._unsqueeze_helper(g, self, axes_i=[0]) + self = symbolic_helper._unsqueeze_helper(g, self, axes_i=[-1]) + elif tensor_rank == 2: + self = symbolic_helper._unsqueeze_helper(g, self, axes_i=[-1]) + return self + + +@_onnx_symbolic("prim::ConstantChunk") +def prim_constant_chunk(g: jit_utils.GraphContext, self, chunks, dim): + input_shape = g.op("Shape", self) + axis = g.op("Constant", value_t=torch.tensor([dim], dtype=torch.long)) + input_shape_dim = g.op("Gather", input_shape, axis, axis_i=0) + start = g.op("Constant", value_t=torch.tensor([0], dtype=torch.long)) + chunk_size = g.op("Constant", value_t=torch.tensor([chunks], dtype=torch.long)) + chunk_size_minus_1 = g.op( + "Constant", value_t=torch.tensor([chunks - 1], dtype=torch.long) + ) + input_shape_dim_shift = g.op("Add", input_shape_dim, chunk_size_minus_1) + chunk_dim = g.op("Div", input_shape_dim_shift, chunk_size) + res = [] + for i in range(chunks): + index = g.op("Constant", value_t=torch.tensor([i + 1], dtype=torch.long)) + end = g.op("Mul", chunk_dim, index) + res.append(g.op("Slice", self, start, end, axis)) + start = end + return res + + +@_onnx_symbolic("aten::hstack") +def hstack(g: jit_utils.GraphContext, tensor_list: _C.Value): + tensor_list = atleast_1d(g, tensor_list) + first_tensor = g.op( + "SequenceAt", + tensor_list, + g.op("Constant", value_t=torch.tensor(0, dtype=torch.long)), + ) + first_tensor_shape = g.op("Shape", first_tensor) + first_tensor_dim = g.op("Size", first_tensor_shape) + + const_one = g.op("Constant", value_t=torch.tensor(1, dtype=torch.long)) + equal_to_one = g.op("Equal", first_tensor_dim, const_one) + + ( + if_op_greater, + (if_context_equal, else_context_equal), + _, + ) = jit_utils.add_op_with_blocks(g, "If", equal_to_one, n_blocks=2, outputs=1) + result_if = if_context_equal.op( + "ConcatFromSequence", tensor_list, axis_i=0, new_axis_i=0 + ) + utils._add_output_to_block(if_context_equal.block, result_if) + result_else = else_context_equal.op( + "ConcatFromSequence", tensor_list, axis_i=1, new_axis_i=0 + ) + utils._add_output_to_block(else_context_equal.block, result_else) + result = if_op_greater.node().output() + + return result + + +@_onnx_symbolic("aten::vstack") +def vstack(g: jit_utils.GraphContext, tensor_list: _C.Value): + tensor_list = atleast_2d(g, tensor_list) + return g.op("ConcatFromSequence", tensor_list, axis_i=0, new_axis_i=0) diff --git a/.venv/Lib/site-packages/torch/onnx/symbolic_opset12.py b/.venv/Lib/site-packages/torch/onnx/symbolic_opset12.py new file mode 100644 index 0000000000000000000000000000000000000000..fff2a205419302c1e3d1ae3336f3514d5255743b --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/symbolic_opset12.py @@ -0,0 +1,465 @@ +# mypy: allow-untyped-defs +# mypy: disable-error-code=arg-type +from __future__ import annotations + +import functools +import sys + +import torch +from torch._C import _onnx as _C_onnx +from torch.onnx import ( + _type_utils, + errors, + symbolic_helper, + symbolic_opset9 as opset9, + utils, +) +from torch.onnx._internal import jit_utils, registration + + +# EDITING THIS FILE? READ THIS FIRST! 
+# see Note [Edit Symbolic Files] in README.md + +# This file exports ONNX ops for opset 12 + +__all__ = [ + "argmax", + "argmin", + "binary_cross_entropy_with_logits", + "celu", + "cross_entropy_loss", + "dropout", + "einsum", + "ge", + "le", + "native_dropout", + "nll_loss", + "nll_loss2d", + "nll_loss_nd", + "outer", + "pow", + "tensordot", + "unfold", +] + +_onnx_symbolic = functools.partial(registration.onnx_symbolic, opset=12) + + +def _einsum_helper(g: jit_utils.GraphContext, equation, tensors): + if not tensors: + raise RuntimeError("Einsum inputs are empty.") + # ONNX does not support bool for Einsum inputs. + if symbolic_helper._is_bool(tensors[0]): + tensors = [ + g.op("Cast", tensor, to_i=_C_onnx.TensorProtoDataType.INT64) + for tensor in tensors + ] + return g.op( + "Cast", + g.op("Einsum", *tensors, equation_s=equation), + to_i=_C_onnx.TensorProtoDataType.BOOL, + ) + else: + return g.op("Einsum", *tensors, equation_s=equation) + + +@_onnx_symbolic("aten::einsum") +@symbolic_helper.parse_args("s", "v", "is") +def einsum(g: jit_utils.GraphContext, equation, tensor_list, path=None): + tensors = symbolic_helper._unpack_list(tensor_list) + return _einsum_helper(g, equation, tensors) + + +@_onnx_symbolic("aten::outer") +@symbolic_helper.parse_args("v", "v") +def outer(g: jit_utils.GraphContext, input, other): + # make sure to cast other to self's type + if _type_utils.JitScalarType.from_value( + other, _type_utils.JitScalarType.UNDEFINED + ) != _type_utils.JitScalarType.from_value(input): + other = g.op( + "Cast", + other, + to_i=_type_utils.JitScalarType.from_value(input).onnx_type(), + ) + return _einsum_helper(g, "i,j->ij", [input, other]) + + +def _dropout_returns_masked_input_and_mask( + g: jit_utils.GraphContext, input: torch._C.Value, p: float, train: bool +) -> tuple[torch._C.Value, torch._C.Value | None]: + symbolic_helper.check_training_mode(train, "dropout") + # In eval mode, dropout is non-op. That is, if the node's + # train param is set to False, dropout just returns its inputs. + if not train: + return input, None + p = g.op("Constant", value_t=torch.tensor(p)) + t = g.op("Constant", value_t=torch.tensor(train, dtype=torch.bool)) + r, mask = g.op("Dropout", input, p, t, outputs=2) + return r, mask + + +@_onnx_symbolic("aten::dropout") +@symbolic_helper.parse_args("v", "f", "b") +def dropout(g: jit_utils.GraphContext, input, p, train): + masked, _ = _dropout_returns_masked_input_and_mask(g, input, p, train) + return masked + + +@_onnx_symbolic("aten::native_dropout") +@symbolic_helper.parse_args("v", "f", "b") +def native_dropout(g: jit_utils.GraphContext, input, p, train): + return _dropout_returns_masked_input_and_mask(g, input, p, train) + + +@_onnx_symbolic("aten::nll_loss") +def nll_loss(g: jit_utils.GraphContext, self, target, weight, reduction, ignore_index): + # none reduction : onnx::Constant[value={0}] + # mean reduction : onnx::Constant[value={1}] + # sum reduction : onnx::Constant[value={2}] + reduction = symbolic_helper._maybe_get_const(reduction, "i") + reduction_vals = ["none", "mean", "sum"] + reduction = reduction_vals[reduction] + + # in onnx NegativeLogLikelihoodLoss specification, ignore_index is optional without default value. + # therefore we need to set ignore_index attribute even if it is not specified (e.g. ignore_index=-100). 
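+ # A minimal eager-mode sketch of a call this lowering handles (hypothetical
+ # tensors; F is torch.nn.functional):
+ #   loss = F.nll_loss(log_probs, target, reduction="mean", ignore_index=-100)
+ # reduction arrives here as the constant 1 and maps to "mean" above, and
+ # ignore_index is forwarded verbatim as the ONNX attribute.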
+ ignore_index = symbolic_helper._maybe_get_const(ignore_index, "i") + if weight.node().mustBeNone(): + nllloss = g.op( + "NegativeLogLikelihoodLoss", + self, + target, + reduction_s=reduction, + ignore_index_i=ignore_index, + ) + else: + nllloss = g.op( + "NegativeLogLikelihoodLoss", + self, + target, + weight, + reduction_s=reduction, + ignore_index_i=ignore_index, + ) + + return nllloss + + +@_onnx_symbolic("aten::nll_loss2d") +def nll_loss2d( + g: jit_utils.GraphContext, self, target, weight, reduction, ignore_index +): + return nll_loss(g, self, target, weight, reduction, ignore_index) + + +@_onnx_symbolic("aten::nll_loss_nd") +def nll_loss_nd( + g: jit_utils.GraphContext, self, target, weight, reduction, ignore_index +): + return nll_loss(g, self, target, weight, reduction, ignore_index) + + +@_onnx_symbolic("aten::cross_entropy_loss") +def cross_entropy_loss( + g: jit_utils.GraphContext, + self, + target, + weight, + reduction, + ignore_index, + label_smoothing, +): + # none reduction : onnx::Constant[value={0}] + # mean reduction : onnx::Constant[value={1}] + # sum reduction : onnx::Constant[value={2}] + reduction = symbolic_helper._maybe_get_const(reduction, "i") + reduction_vals = ["none", "mean", "sum"] + reduction = reduction_vals[reduction] + + label_smoothing = symbolic_helper._maybe_get_const(label_smoothing, "f") + if label_smoothing is not None and label_smoothing > 0.0: + raise errors.SymbolicValueError( + "Unsupported: ONNX does not support label_smoothing", self + ) + + # in onnx SoftmaxCrossEntropyLoss specification, ignore_index is optional without default value. + # therefore we need to set ignore_index attribute even if it is not specified (e.g. ignore_index=-100). + ignore_index = symbolic_helper._maybe_get_const(ignore_index, "i") + if weight.node().mustBeNone(): + celoss = g.op( + "SoftmaxCrossEntropyLoss", + self, + target, + reduction_s=reduction, + ignore_index_i=ignore_index, + ) + else: + celoss = g.op( + "SoftmaxCrossEntropyLoss", + self, + target, + weight, + reduction_s=reduction, + ignore_index_i=ignore_index, + ) + + return celoss + + +@_onnx_symbolic("aten::binary_cross_entropy_with_logits") +@symbolic_helper.parse_args("v", "v", "v", "v", "i") +def binary_cross_entropy_with_logits( + g: jit_utils.GraphContext, input, target, weight, pos_weight, reduction +): + p = g.op("Constant", value_t=torch.tensor([1])) + sig_x = opset9.sigmoid(g, input) + log_sig_x = opset9.log(g, sig_x) + sub_1_x = opset9.sub(g, p, sig_x) + sub_1_y = opset9.sub(g, p, target) + log_1_x = opset9.log(g, sub_1_x) + if pos_weight is None or symbolic_helper._is_none(pos_weight): + output = opset9.neg( + g, + opset9.add( + g, opset9.mul(g, target, log_sig_x), opset9.mul(g, sub_1_y, log_1_x) + ), + ) + else: + output = opset9.neg( + g, + opset9.add( + g, + opset9.mul(g, opset9.mul(g, target, log_sig_x), pos_weight), + opset9.mul(g, sub_1_y, log_1_x), + ), + ) + + if weight is not None and not symbolic_helper._is_none(weight): + output = opset9.mul(g, weight, output) + + reduction = symbolic_helper._maybe_get_const(reduction, "i") + if reduction == 0: + return output + elif reduction == 1: + return g.op("ReduceMean", output, keepdims_i=0) + elif reduction == 2: + return g.op("ReduceSum", output, keepdims_i=0) + else: + return symbolic_helper._onnx_unsupported( + "binary_cross_entropy_with_logits with reduction other than none, mean, or sum", + input, + ) + + +@_onnx_symbolic("aten::celu") +def celu(g: jit_utils.GraphContext, self, alpha): + alpha = symbolic_helper._maybe_get_const(alpha, 
"f") + # if the input is of type double cast it to float + if ( + _type_utils.JitScalarType.from_value(self, _type_utils.JitScalarType.UNDEFINED) + == _type_utils.JitScalarType.DOUBLE + ): + self = g.op("Cast", self, to_i=_C_onnx.TensorProtoDataType.FLOAT) + out = g.op("Celu", self, alpha_f=alpha) + return g.op("Cast", out, to_i=_C_onnx.TensorProtoDataType.DOUBLE) + + return g.op("Celu", self, alpha_f=alpha) + + +@_onnx_symbolic("aten::argmax") +@symbolic_helper.parse_args("v", "v", "b") +def argmax( + g: jit_utils.GraphContext, + input: torch._C.Value, + dim: torch._C.Value, + keepdim: bool, +): + return symbolic_helper._argmin_argmax_helper(g, input, dim, keepdim, "ArgMax") + + +@_onnx_symbolic("aten::argmin") +@symbolic_helper.parse_args("v", "v", "b") +def argmin( + g: jit_utils.GraphContext, + input: torch._C.Value, + dim: torch._C.Value, + keepdim: bool, +): + return symbolic_helper._argmin_argmax_helper(g, input, dim, keepdim, "ArgMin") + + +@_onnx_symbolic("aten::pow") +def pow(g: jit_utils.GraphContext, self, exponent): + return g.op("Pow", self, exponent) + + +@_onnx_symbolic("aten::ge") +def ge(g: jit_utils.GraphContext, input, other): + return g.op("GreaterOrEqual", input, other) + + +@_onnx_symbolic("aten::le") +def le(g: jit_utils.GraphContext, input, other): + return g.op("LessOrEqual", input, other) + + +@_onnx_symbolic("aten::unfold") +@symbolic_helper.parse_args("v", "i", "v", "v") +def unfold(g: jit_utils.GraphContext, input, dimension, size, step): + const_size = symbolic_helper._maybe_get_const(size, "i") + const_step = symbolic_helper._maybe_get_const(step, "i") + if not symbolic_helper._is_value(const_size) and not symbolic_helper._is_value( + const_step + ): + return opset9.unfold(g, input, dimension, const_size, const_step) + + sizedim = symbolic_helper._get_tensor_dim_size(input, dimension) + if sizedim is not None: + low_start = g.op("Constant", value_t=torch.tensor(0)) + low_end = g.op("Constant", value_t=torch.tensor(sizedim)) + hi_end = g.op("Constant", value_t=torch.tensor(sizedim + 1)) + low_indices = g.op("Range", low_start, low_end, step) + hi_indices = g.op("Range", size, hi_end, step) + + low_size = symbolic_helper._size_helper( + g, low_indices, g.op("Constant", value_t=torch.tensor(0)) + ) + hi_size = symbolic_helper._size_helper( + g, hi_indices, g.op("Constant", value_t=torch.tensor(0)) + ) + + ndim = symbolic_helper._get_tensor_rank(input) + assert ndim is not None + perm = list(range(0, ndim)) + perm.append(perm.pop(dimension)) + + unsqueeze_list = [] + loop_condition = g.op("Constant", value_t=torch.tensor(1)) + loop_condition = g.op( + "Cast", loop_condition, to_i=_C_onnx.TensorProtoDataType.BOOL + ) + loop_len = g.op("Min", low_size, hi_size) + + loop, (loop_context,), _ = jit_utils.add_op_with_blocks( + g, "Loop", loop_len, loop_condition, n_blocks=1 + ) + + loop_block = loop_context.block + block_input_iter = utils._add_input_to_block(loop_block) + # FIXME(justinchuby): cond is unused? 
+ cond = utils._add_input_to_block(loop_block) + + starts = loop_context.op("Gather", low_indices, block_input_iter) + ends = loop_context.op("Gather", hi_indices, block_input_iter) + axes = loop_context.op("Constant", value_t=torch.tensor([2])) + starts = symbolic_helper._unsqueeze_helper(loop_context, starts, [0]) + ends = symbolic_helper._unsqueeze_helper(loop_context, ends, [0]) + stack = loop_context.op("Slice", input, starts, ends, axes) + + unsqueeze = symbolic_helper._unsqueeze_helper( + loop_context, loop_context.op("Transpose", stack, perm_i=perm), [dimension] + ) + unsqueeze_list.append(unsqueeze) + concat = loop_context.op("Concat", *unsqueeze_list, axis_i=0) + + cond_out = loop_context.op( + "Cast", loop_condition, _C_onnx.TensorProtoDataType.BOOL + ) + utils._add_output_to_block(loop_block, cond_out) + utils._add_output_to_block(loop_block, concat) + + loop_output = loop.node().output() + perm = [0, 1, 2, 3, 4] + perm[0], perm[dimension + 1] = perm[dimension + 1], perm[0] + transpose = g.op("Transpose", loop_output, perm_i=perm) + squeeze = symbolic_helper._squeeze_helper(g, transpose, [0]) + + return squeeze + + return symbolic_helper._unimplemented("Unfold", "input size not accessible") + + +@_onnx_symbolic("aten::tensordot") +@symbolic_helper.parse_args("v", "v", "is", "is", "v") +def tensordot(g: jit_utils.GraphContext, input_a, input_b, dims_a, dims_b, out=None): + if out is not None: + symbolic_helper._unimplemented( + "Tensordot", "Out parameter is not supported for tensordot." + ) + + dim_count_a = symbolic_helper._get_tensor_rank(input_a) + if dim_count_a is None: + raise errors.SymbolicValueError( + "Unsupported: ONNX export of tensordot for tensor(input_a) of unknown rank.", + input_a, + ) + + dim_count_b = symbolic_helper._get_tensor_rank(input_b) + if dim_count_b is None: + raise errors.SymbolicValueError( + "Unsupported: ONNX export of tensordot for tensor(input_b) of unknown rank.", + input_b, + ) + + dims_a = [ + (dims_a[i] + dim_count_a) if (dims_a[i] < 0) else dims_a[i] + for i in range(len(dims_a)) + ] + dims_b = [ + (dims_b[i] + dim_count_b) if (dims_b[i] < 0) else dims_b[i] + for i in range(len(dims_b)) + ] + + left_dims_a = [i for i in range(dim_count_a) if (i not in dims_a)] + left_dims_b = [i for i in range(dim_count_b) if (i not in dims_b)] + + new_input_a = opset9.permute(g, input_a, left_dims_a + dims_a) + new_input_b = opset9.permute(g, input_b, dims_b + left_dims_b) + + input_shape = g.op("Shape", new_input_a) + left_sizes_a = symbolic_helper._slice_helper( + g, input_shape, axes=[0], starts=[0], ends=[len(left_dims_a)] + ) + shape_sizes = [ + left_sizes_a, + g.op("Constant", value_t=torch.tensor([-1], dtype=torch.long)), + ] + output_a = opset9._reshape_from_tensor(g, new_input_a, shape_sizes) + + input_shape = g.op("Shape", output_a) + slices = symbolic_helper._slice_helper( + g, input_shape, axes=[0], starts=[-1], ends=[sys.maxsize] + ) + shape_sizes = [ + g.op("Constant", value_t=torch.tensor([-1], dtype=torch.long)), + slices, + ] + output_a = opset9._reshape_from_tensor(g, new_input_a, shape_sizes) + + input_shape = g.op("Shape", new_input_b) + left_sizes_b = symbolic_helper._slice_helper( + g, input_shape, axes=[0], starts=[len(dims_b)], ends=[sys.maxsize] + ) + slices = symbolic_helper._slice_helper( + g, input_shape, axes=[0], starts=[0], ends=[len(dims_b)] + ) + shape_sizes = [ + slices, + g.op("Constant", value_t=torch.tensor([-1], dtype=torch.long)), + ] + output_b = opset9._reshape_from_tensor(g, new_input_b, shape_sizes) + + 
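+ # Both operands are now permuted and flattened so that the contraction
+ # below reduces to a single matrix multiply; as an eager-mode sanity check,
+ # torch.tensordot(a, b, dims=([1], [0])) on 2-D tensors equals a @ b.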
input_shape = g.op("Shape", output_b) + slices = symbolic_helper._slice_helper( + g, input_shape, axes=[0], starts=[-1], ends=[sys.maxsize] + ) + shape_sizes = [ + g.op("Constant", value_t=torch.tensor([-1], dtype=torch.long)), + slices, + ] + output_b = opset9._reshape_from_tensor(g, new_input_b, shape_sizes) + + output = einsum(g, "ij,jk->ik", g.op("prim::ListConstruct", *[output_a, output_b])) + + shape_sizes = [left_sizes_a, left_sizes_b] + return opset9._reshape_from_tensor(g, output, shape_sizes) diff --git a/.venv/Lib/site-packages/torch/onnx/symbolic_opset13.py b/.venv/Lib/site-packages/torch/onnx/symbolic_opset13.py new file mode 100644 index 0000000000000000000000000000000000000000..5137cbb7a682bc353c3a78b0eeafcb6668fae202 --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/symbolic_opset13.py @@ -0,0 +1,1113 @@ +# mypy: allow-untyped-defs +# EDITING THIS FILE? READ THIS FIRST! +# see Note [Edit Symbolic Files] in README.md + +# This file exports ONNX ops for opset 13 +import functools + +import torch +import torch._C._onnx as _C_onnx +from torch.onnx import ( + _constants, + _type_utils, + errors, + symbolic_helper, + symbolic_opset11 as opset11, + symbolic_opset9 as opset9, + utils, +) +from torch.onnx._internal import jit_utils, registration + + +_onnx_symbolic = functools.partial(registration.onnx_symbolic, opset=13) + + +@_onnx_symbolic("aten::softmax") +@symbolic_helper.parse_args("v", "i", "none") +def softmax(g: jit_utils.GraphContext, input, dim, dtype=None): + softmax = g.op("Softmax", input, axis_i=dim) + if dtype and dtype.node().kind() != "prim::Constant": + parsed_dtype = symbolic_helper._get_const(dtype, "i", "dtype") + softmax = g.op( + "Cast", softmax, to_i=_type_utils.JitScalarType(parsed_dtype).onnx_type() + ) + + return softmax + + +@_onnx_symbolic("aten::log_softmax") +@symbolic_helper.parse_args("v", "i", "none") +def log_softmax(g: jit_utils.GraphContext, input, dim, dtype=None): + return_op = g.op("LogSoftmax", input, axis_i=dim) + if dtype and dtype.node().kind() != "prim::Constant": + parsed_dtype = symbolic_helper._get_const(dtype, "i", "dtype") + return_op = g.op( + "Cast", return_op, to_i=_type_utils.JitScalarType(parsed_dtype).onnx_type() + ) + return return_op + + +@_onnx_symbolic("aten::frobenius_norm") +@symbolic_helper.parse_args("v", "v", "i") +def frobenius_norm(g: jit_utils.GraphContext, self, dim=None, keepdim=False): + dim_val = symbolic_helper._maybe_get_const(dim, "is") + if not symbolic_helper._is_value(dim_val) and len(dim_val) == 0: + return g.op("ReduceL2", self, keepdims_i=0) + sqr = g.op("Mul", self, self) + sumsqr = symbolic_helper._reducesum_helper(g, sqr, dim, keepdims_i=keepdim) + return g.op("Sqrt", sumsqr) + + +@_onnx_symbolic("aten::split") +@symbolic_helper.parse_args("v", "v", "i", "i") +def split(g: jit_utils.GraphContext, self, split_size_or_sizes, dim, _outputs=None): + if not symbolic_helper._is_split_static(split_size_or_sizes, _outputs): + split_out = g.op("SplitToSequence", self, split_size_or_sizes, axis_i=dim) + if _outputs is None: + return split_out + # Convert to multiple slice nodes iff number of splits and number of outputs are statically known. 
+ if ( + symbolic_helper._is_packed_list(split_size_or_sizes) + and len(symbolic_helper._unpack_list(split_size_or_sizes)) == _outputs + ): + split_sizes = [ + symbolic_helper._unsqueeze_helper(g, v, [0]) + for v in symbolic_helper._unpack_list(split_size_or_sizes) + ] + + start = g.op("Constant", value_t=torch.tensor([0], dtype=torch.long)) + axis = g.op("Constant", value_t=torch.tensor([dim], dtype=torch.long)) + res = [] + for i in range(_outputs): + end = g.op( + "Add", start, split_sizes[i] + ) # split_sizes is a list of same length as _outputs + res.append(g.op("Slice", self, start, end, axis)) + start = end + return res + return [ + g.op( + "SequenceAt", + split_out, + g.op("Constant", value_t=torch.tensor([i], dtype=torch.long)), + ) + for i in range(_outputs) + ] + + split_val = symbolic_helper._node_get(split_size_or_sizes.node(), "value") + if split_val.dim() > 0: + return g.op("Split", self, split_size_or_sizes, axis_i=dim, outputs=_outputs) + split_size = symbolic_helper._get_const(split_size_or_sizes, "i", "split_size") + + size = symbolic_helper._get_tensor_dim_size(self, dim) + if size is None: + if _outputs is not None: + size = split_size * _outputs + else: + raise errors.SymbolicValueError( + "Unknown dimension size not supported", self + ) + splits = [split_size] * (size // split_size) + leftover = size % split_size + if leftover: + splits.append(leftover) + splits = g.op("Constant", value_t=torch.tensor(splits)) + return g.op("Split", self, splits, axis_i=dim, outputs=_outputs) + + +@_onnx_symbolic("aten::split_with_sizes") +def split_with_sizes(g: jit_utils.GraphContext, self, split_sizes, dim, _outputs=None): + return split(g, self, split_sizes, dim, _outputs) + + +@_onnx_symbolic("aten::unsafe_split") +def unsafe_split( + g: jit_utils.GraphContext, self, split_size_or_sizes, dim, _outputs=None +): + return split(g, self, split_size_or_sizes, dim, _outputs) + + +@_onnx_symbolic("aten::unsafe_split_with_sizes") +def unsafe_split_with_sizes( + g: jit_utils.GraphContext, self, split_sizes, dim, _outputs=None +): + return split_with_sizes(g, self, split_sizes, dim, _outputs) + + +@_onnx_symbolic("aten::tensor_split") +@symbolic_helper.parse_args("v", "v", "i", "i") +def tensor_split( + g: jit_utils.GraphContext, self, indices_or_sections, dim, _outputs=None +): + axis = g.op("Constant", value_t=torch.tensor(dim, dtype=torch.long)) + axis = opset11.unsqueeze(g, axis, 0) + const_1 = g.op("Constant", value_t=torch.tensor(1, dtype=torch.long)) + + if symbolic_helper._is_split_static(indices_or_sections, _outputs): + split_val = symbolic_helper._node_get(indices_or_sections.node(), "value") + + if split_val.dim() > 0: + start = g.op("Constant", value_t=torch.tensor([0], dtype=torch.long)) + res = [] + assert _outputs is not None + for i in range(_outputs - 1): + end = g.op( + "Gather", + indices_or_sections, + g.op("Constant", value_t=torch.tensor([i], dtype=torch.long)), + axis_i=0, + ) + res.append(g.op("Slice", self, start, end, axis)) + start = end + + end = symbolic_helper._size_helper(g, self, axis) + res.append(g.op("Slice", self, start, end, axis)) + return res + + split_size = symbolic_helper._get_const( + indices_or_sections, "i", "indices_or_sections" + ) + + size = symbolic_helper._get_tensor_dim_size(self, dim) + if size is None: + if _outputs is not None: + size = split_size * _outputs + else: + raise errors.SymbolicValueError( + "Unknown dimension size not supported", self + ) + + min_split_size = size // split_size + num_splits_one_extra = size % split_size + 
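+ # Worked example: size = 11 with indices_or_sections = 3 gives
+ # min_split_size = 3 and num_splits_one_extra = 2, so the sizes built below
+ # are [4, 4, 3], matching torch.tensor_split(torch.arange(11), 3).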
+ splits = num_splits_one_extra * [min_split_size + 1] + leftover = (split_size - num_splits_one_extra) * [min_split_size] + + splits = g.op( + "Constant", value_t=torch.tensor(splits + leftover, dtype=torch.long) + ) + return g.op("Split", self, splits, axis_i=dim, outputs=_outputs) + + if ( + symbolic_helper._is_tensor(indices_or_sections) + and symbolic_helper._get_tensor_rank(indices_or_sections) == 1 + ): + loop_len = symbolic_helper._size_helper( + g, indices_or_sections, g.op("Constant", value_t=torch.tensor(0)) + ) + loop_len = opset11.unsqueeze(g, loop_len, 0) + loop_condition = g.op("Cast", const_1, to_i=_C_onnx.TensorProtoDataType.BOOL) + + # To make the first slice in the below loop work, + # we pad a zero to the first position so that it will be the initial start of slice. + padding_0 = g.op("Constant", value_t=torch.tensor([0], dtype=torch.long)) + indices_or_sections = g.op("Concat", padding_0, indices_or_sections, axis_i=0) + + final_splits = g.op("SequenceEmpty") + # Loop inputs + loop, (loop_context,), _ = jit_utils.add_op_with_blocks( + g, "Loop", loop_len, loop_condition, final_splits, outputs=1, n_blocks=1 + ) + + loop_block = loop_context.block + block_input_iter = utils._add_input_to_block(loop_block) + cond = utils._add_input_to_block(loop_block) + final_splits = utils._add_input_to_block(loop_block) + + start = loop_context.op( + "Gather", indices_or_sections, block_input_iter, axis_i=0 + ) + end = loop_context.op( + "Gather", + indices_or_sections, + loop_context.op("Add", block_input_iter, const_1), + axis_i=0, + ) + + slice = loop_context.op("Slice", self, start, end, axis) + final_splits = loop_context.op("SequenceInsert", final_splits, slice) + + # Loop outputs + cond_out = loop_context.op("Identity", loop_condition) + utils._add_output_to_block(loop_block, cond_out) + utils._add_output_to_block(loop_block, final_splits) + + loop_out = loop.node().output() + start = g.op( + "Gather", + indices_or_sections, + g.op("Constant", value_t=torch.tensor(-1, dtype=torch.long)), + axis_i=0, + ) + start = opset11.unsqueeze(g, start, 0) + end = symbolic_helper._size_helper(g, self, axis) + + last_slice = g.op("Slice", self, start, end, axis) + + return g.op("SequenceInsert", loop_out, last_slice) + + else: # scalar tensor + dim_size = symbolic_helper._size_helper(g, self, axis) + min_split_size = g.op("Div", dim_size, indices_or_sections) + min_split_size_plus_1 = g.op( + "Add", + min_split_size, + const_1, + ) + num_splits_one_extra = g.op("Mod", dim_size, indices_or_sections) + splits = g.op("Tile", min_split_size_plus_1, num_splits_one_extra) + leftover = g.op( + "Tile", + min_split_size, + g.op( + "Sub", + opset11.unsqueeze(g, indices_or_sections, 0), + num_splits_one_extra, + ), + ) + + splits = g.op("Concat", splits, leftover, axis_i=0) + if _outputs is None: + return g.op("SplitToSequence", self, splits, axis_i=dim) + return g.op("Split", self, splits, axis_i=dim, outputs=_outputs) + + +@_onnx_symbolic("aten::unbind") +@symbolic_helper.parse_args("v", "i", "i") +def unbind(g: jit_utils.GraphContext, self, dim=0, _outputs=None): + if _outputs is None: + return g.op( + "SplitToSequence", + self, + g.op("Constant", value_t=torch.tensor(1, dtype=torch.long)), + axis_i=dim, + keepdims_i=0, + ) + + splits = g.op("Constant", value_t=torch.tensor([1] * _outputs)) + outputs = g.op("Split", self, splits, axis_i=dim, outputs=_outputs) + outputs = [outputs] if _outputs == 1 else outputs + squeezed_outputs = [ + g.op("Squeeze", out, g.op("Constant", 
value_t=torch.tensor([dim]))) + for out in outputs + ] + return squeezed_outputs + + +@_onnx_symbolic("aten::nonzero_numpy") +# Emitted from `torch.nonzero(x, as_tuple=True)` +def nonzero_numpy(g: jit_utils.GraphContext, input, _outputs=None): + return unbind(g, opset9.nonzero(g, input), 1, _outputs=_outputs) + + +@_onnx_symbolic("aten::where") +@symbolic_helper.parse_args("v", "v", "v", "i") +def where(g: jit_utils.GraphContext, condition, self=None, other=None, _outputs=None): + # Assumes that torch.where's first argument takes only Bool and Byte tensors. + if not symbolic_helper._is_bool(condition): + condition = g.op("Cast", condition, to_i=_C_onnx.TensorProtoDataType.BOOL) + if self is None: + condition = opset9.nonzero(g, condition) + return symbolic_helper._unbind_helper( + g, condition, g.op("Constant", value_t=torch.tensor(1)), _outputs + ) + return g.op("Where", condition, self, other) + + +@_onnx_symbolic("aten::fake_quantize_per_channel_affine") +@symbolic_helper.parse_args("v", "v", "v", "i", "i", "i") +def fake_quantize_per_channel_affine( + g: jit_utils.GraphContext, + inputs, + scale, + zero_point, + axis, + quant_min=-128, + quant_max=127, +): + # NOTE: (0, 127) is allowed as special case. PyTorch restricts activations to be in the range (0, 127). + # https://github.com/pytorch/pytorch/blob/b34b192d6b97325c9f78e5995c48c8498ede34bd/torch/ao/quantization/observer.py#L1422 + if (quant_min, quant_max) not in [(0, 255), (-128, 127), (0, 127)]: + raise errors.SymbolicValueError( + "For (quant_min, quant_max), ONNX allows only (0, 127), (0, 255) and (-128, 127). " + f"Got ({quant_min}, {quant_max})", + inputs, + ) + # ONNX defines zero_point to be int8 or uint8 + if quant_min == 0: + zero_point = g.op("Cast", zero_point, to_i=_C_onnx.TensorProtoDataType.UINT8) + else: + zero_point = g.op("Cast", zero_point, to_i=_C_onnx.TensorProtoDataType.INT8) + quantized = g.op("QuantizeLinear", inputs, scale, zero_point, axis_i=axis) + if (quant_min, quant_max) == (0, 127): + quantized = g.op( + "Clip", + quantized, + opset9.unused(g), + g.op("Constant", value_t=torch.tensor(127, dtype=torch.uint8)), + ) + return g.op("DequantizeLinear", quantized, scale, zero_point, axis_i=axis) + + +@_onnx_symbolic("aten::fake_quantize_per_tensor_affine") +@symbolic_helper.parse_args("v", "v", "v", "i", "i") +def fake_quantize_per_tensor_affine( + g: jit_utils.GraphContext, + inputs, + scale, + zero_point, + quant_min=-128, + quant_max=127, +): + # NOTE: (0, 127) is allowed as special case. PyTorch restricts activations to be in the range (0, 127). + # https://github.com/pytorch/pytorch/blob/b34b192d6b97325c9f78e5995c48c8498ede34bd/torch/ao/quantization/observer.py#L1422 + if (quant_min, quant_max) not in [(0, 255), (-128, 127), (0, 127)]: + raise errors.SymbolicValueError( + "For (quant_min, quant_max), ONNX allows only (0, 127), (0, 255) and (-128, 127). 
" + f"Got ({quant_min}, {quant_max})", + inputs, + ) + if quant_min == 0: + zero_point = g.op("Cast", zero_point, to_i=_C_onnx.TensorProtoDataType.UINT8) + else: + zero_point = g.op("Cast", zero_point, to_i=_C_onnx.TensorProtoDataType.INT8) + if ( + _type_utils.JitScalarType.from_value(scale, _type_utils.JitScalarType.UNDEFINED) + != _type_utils.JitScalarType.FLOAT + ): + scale = g.op("Cast", scale, to_i=_C_onnx.TensorProtoDataType.FLOAT) + quantized = g.op("QuantizeLinear", inputs, scale, zero_point) + if (quant_min, quant_max) == (0, 127): + quantized = g.op( + "Clip", + quantized, + opset9.unused(g), + g.op("Constant", value_t=torch.tensor(127, dtype=torch.uint8)), + ) + return g.op("DequantizeLinear", quantized, scale, zero_point) + + +def _reduce_op_symbolic(onnx_op_name): + def symbolic(g, self, dim=None, keepdim=None): + self = symbolic_helper._maybe_cast_reduce_op_input(g, self) + if dim is None: + # all-reduce path + return symbolic_helper._handle_reduce_dim_none(g, self, onnx_op_name) + else: + keepdim = symbolic_helper._get_const(keepdim, "i", "keepdim") + return g.op(onnx_op_name, self, dim, keepdims_i=keepdim) + + return symbolic + + +@_onnx_symbolic( + "aten::sum", + decorate=[symbolic_helper._apply_params("ReduceSum", "sum")], +) +def _reduce_with_dtype(onnx_op, name): + symbolic = _reduce_op_symbolic(onnx_op) + + @symbolic_helper._overload_by_arg_count + def reduce(g, *args, **kwargs): + @symbolic_helper.parse_args("v", "none") + def reduce_nodim(g, self, dtype): + dtype_onnx = None + if dtype.node().kind() == "onnx::Constant": + dtype = symbolic_helper._get_const(dtype, "i", "dtype") + dtype_onnx = _type_utils.JitScalarType(dtype).onnx_type() + self = g.op("Cast", self, to_i=dtype_onnx) + elif dtype.node().kind() != "prim::Constant": + return symbolic_helper._unimplemented(name, "dtype", dtype) + result = symbolic(g, self) + if dtype_onnx is not None: + result_dtype_onnx = _type_utils.JitScalarType.from_value( + result + ).onnx_type() + if result_dtype_onnx != dtype_onnx: + result = g.op("Cast", result, to_i=dtype_onnx) + return result + + @symbolic_helper.parse_args("v", "v", "i", "none") + def reduce_dim(g, self, dim, keepdim, dtype): + dtype_onnx = None + if dtype.node().kind() == "onnx::Constant": + dtype = symbolic_helper._get_const(dtype, "i", "dtype") + dtype_onnx = _type_utils.JitScalarType(dtype).onnx_type() + self = g.op("Cast", self, to_i=dtype_onnx) + elif dtype.node().kind() != "prim::Constant": + return symbolic_helper._unimplemented(name, "dtype", dtype) + result = symbolic(g, self, dim, keepdim) + if dtype_onnx is not None: + result_dtype_onnx = _type_utils.JitScalarType.from_value( + result + ).onnx_type() + if result_dtype_onnx != dtype_onnx: + result = g.op("Cast", result, to_i=dtype_onnx) + return result + + return reduce_nodim, reduce_dim + + return reduce + + +# Ported from +# https://github.com/microsoft/onnxscript/blob/6b1b81700b4523f31d8c6d3321e5d8ef5d42b764/onnxscript/function_libs/torch_aten/ops/core.py#L6097 +# NOTE: Supporting aten::unflatten before opset13 needs helper function to adjust ONNX op changes in Concat, Slice, ... +@_onnx_symbolic("aten::unflatten") +def unflatten(g: jit_utils.GraphContext, input, dim, unflattened_size): + input_dim = symbolic_helper._get_tensor_rank(input) + if input_dim is None: + return symbolic_helper._unimplemented( + "dim", + "ONNX and PyTorch use different strategies to split the input. 
" + "Input rank must be known at export time.", + ) + + # dim could be negative + input_dim = g.op("Constant", value_t=torch.tensor([input_dim], dtype=torch.int64)) + dim = g.op("Add", input_dim, dim) + dim = g.op("Mod", dim, input_dim) + + input_size = g.op("Shape", input) + + head_start_idx = g.op("Constant", value_t=torch.tensor([0], dtype=torch.int64)) + head_end_idx = g.op( + "Reshape", dim, g.op("Constant", value_t=torch.tensor([1], dtype=torch.int64)) + ) + head_part_rank = g.op("Slice", input_size, head_start_idx, head_end_idx) + + dim_plus_one = g.op( + "Add", dim, g.op("Constant", value_t=torch.tensor([1], dtype=torch.int64)) + ) + tail_start_idx = g.op( + "Reshape", + dim_plus_one, + g.op("Constant", value_t=torch.tensor([1], dtype=torch.int64)), + ) + tail_end_idx = g.op( + "Constant", value_t=torch.tensor([_constants.INT64_MAX], dtype=torch.int64) + ) + tail_part_rank = g.op("Slice", input_size, tail_start_idx, tail_end_idx) + + final_shape = g.op( + "Concat", head_part_rank, unflattened_size, tail_part_rank, axis_i=0 + ) + + return symbolic_helper._reshape_helper(g, input, final_shape) + + +@_onnx_symbolic("aten::unsafe_chunk") +@symbolic_helper.parse_args("v", "i", "i", "i") +def unsafe_chunk(g: jit_utils.GraphContext, self, chunks, dim, _outputs=None): + if _outputs is None: + return g.op( + "SplitToSequence", + self, + g.op("Constant", value_t=torch.tensor(1, dtype=torch.long)), + axis_i=dim, + keepdims_i=0, + ) + + size = symbolic_helper._get_tensor_dim_size(self, dim) + if size is None: + return symbolic_helper._unimplemented("unsafe_chunk", "unknown dimension size") + split_size = (size + chunks - 1) // chunks + splits = [split_size] * (size // split_size) + leftover = size % split_size + if leftover: + splits.append(leftover) + + # TODO: So far we don"t have a module using this method. We"ll keep + # this as a constant unless we see a request of dynamics in any + # user's modules. + splits = g.op("Constant", value_t=torch.tensor(splits, dtype=torch.long)) + return g.op("Split", self, splits, axis_i=dim, outputs=_outputs) + + +@_onnx_symbolic("aten::tile") +def tile(g: jit_utils.GraphContext, self, dims): + self_shape = g.op("Shape", self) + self_rank = g.op("Size", self_shape) + dims_rank = g.op("Size", dims) + diff = g.op("Sub", self_rank, dims_rank) + const_zero = g.op("Constant", value_t=torch.tensor([0])) + + # 1. If dims is shorter than self.shape pad dims with 1 + dims_shorter_than_self_shape = g.op("Greater", diff, const_zero) + ( + if_op_greater, + (if_context_greater, else_context_greater), + _, + ) = jit_utils.add_op_with_blocks( + g, "If", dims_shorter_than_self_shape, n_blocks=2, outputs=1 + ) + const_one = if_context_greater.op("Constant", value_t=torch.LongTensor([1])) + diff_1d_greater = if_context_greater.op("Reshape", diff, const_one) + exapnd_ones_greater = if_context_greater.op("Expand", const_one, diff_1d_greater) + dims_ = if_context_greater.op("Concat", exapnd_ones_greater, dims, axis_i=0) + utils._add_output_to_block(if_context_greater.block, dims_) + identity_dim = else_context_greater.op("Identity", dims) + utils._add_output_to_block(else_context_greater.block, identity_dim) + dims_final = if_op_greater.node().output() + + # 2. 
If dims is longer than self.shape pad self.shape with 1 + dims_longer_than_self_shape = g.op("Less", diff, const_zero) + ( + if_op_less, + (if_context_less, else_context_less), + _, + ) = jit_utils.add_op_with_blocks( + g, "If", dims_longer_than_self_shape, n_blocks=2, outputs=1 + ) + const_one = if_context_less.op("Constant", value_t=torch.LongTensor([1])) + diff_1d_less = if_context_less.op( + "Reshape", + if_context_less.op("Abs", diff), + const_one, + ) + exapnd_ones_less = if_context_less.op("Expand", const_one, diff_1d_less) + self_final_shape = if_context_less.op( + "Concat", exapnd_ones_less, self_shape, axis_i=0 + ) + self_ = if_context_less.op("Reshape", self, self_final_shape) + utils._add_output_to_block(if_context_less.block, self_) + identity_self = else_context_less.op("Identity", self) + utils._add_output_to_block(else_context_less.block, identity_self) + self_final = if_op_less.node().output() + + dims_final = g.op("Cast", dims_final, to_i=_C_onnx.TensorProtoDataType.INT64) + return g.op("Tile", self_final, dims_final) + + +@_onnx_symbolic("aten::repeat_interleave") +def repeat_interleave( + g: jit_utils.GraphContext, self, repeats, dim=None, output_size=None +): + repeats_dim = symbolic_helper._get_tensor_rank(repeats) + repeats_sizes = symbolic_helper._get_tensor_sizes(repeats) + input_sizes = symbolic_helper._get_tensor_sizes(self) + if repeats_dim is None: + raise errors.SymbolicValueError( + "Unsupported: ONNX export of repeat_interleave for unknown repeats rank.", + self, + ) + if repeats_sizes is None: + raise errors.SymbolicValueError( + "Unsupported: ONNX export of repeat_interleave for unknown repeats size.", + self, + ) + if input_sizes is None: + raise errors.SymbolicValueError( + "Unsupported: ONNX export of repeat_interleave for unknown input size.", + self, + ) + + final_dim = dim + # if dim is None flatten + # By default, use the flattened input array, and return a flat output array + if symbolic_helper._is_none(dim): + self = symbolic_helper._reshape_helper( + g, self, g.op("Constant", value_t=torch.tensor([-1])) + ) + dim = torch.tensor(0, dtype=torch.int64) + else: + dim = symbolic_helper._maybe_get_scalar(dim) + + # Handle cases where dim is negative + if dim < 0: + dim += len(input_sizes) + + output_sizes = input_sizes.copy() + for idx, input_size in enumerate(input_sizes): + if input_size is None: + output_sizes[idx], input_sizes[idx] = 0, -1 + + # Check if all indices should be repeated the same number of times. 
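+ # Eager-mode reference for the single-value branch below:
+ # torch.repeat_interleave(torch.tensor([1, 2]), 2) -> tensor([1, 1, 2, 2])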
+ if repeats_dim == 0 or (repeats_dim == 1 and repeats_sizes[0] == 1): + return symbolic_helper._repeat_interleave_single_value_repeat_helper( + g, self, repeats, dim + ) + + cond_dynamic_repeats = repeats_dim == 1 and repeats_sizes[0] is None + # If input size is dynamic or repeats vector is dynamic + if output_sizes[dim] == 0 or cond_dynamic_repeats: + reps = symbolic_helper._size_helper(g, self, dim) + reps = opset11.unsqueeze(g, reps, 0) + + # Check if repeats is dynamic + # As repeats is dynamic, we use a where node as a substitute for the if statement + # If repests_dim = 1, expand repeats otherwise use original tensor + if cond_dynamic_repeats: + repeat_dim = symbolic_helper._size_helper( + g, repeats, g.op("Constant", value_t=torch.LongTensor([0])) + ) + repeat_cond = g.op( + "Equal", repeat_dim, g.op("Constant", value_t=torch.LongTensor([1])) + ) + repeats = where(g, repeat_cond, g.op("Expand", repeats, reps), repeats) + # There are cases when the repeats are 1-d tensor with multiple repeats, but dim + # provided along one of the dynamic axes provided. A simple example would be + # input.shape -> [1, 1, *] where * represents the dynamic axes, and dim = 2 + # Now, repeat interleaving can be performed in pytorch when the value of * matches + # with the number of elements in repeat, for example if * -> 2, number of repeats + # should be 2 as well. + else: + return opset9.repeat_interleave(g, self, repeats, final_dim) + + reps_like = g.op( + "ConstantOfShape", + g.op("Shape", repeats), + value_t=torch.tensor([1], dtype=torch.long), + ) + r_splits = split(g, repeats, reps_like, 0) + i_splits = split(g, self, reps_like, dim) + + output_sizes[dim], input_sizes[dim] = -1, 1 + + # Create a loop to iterate over each value along the dimension + # and perform individual interleaving using the repeats tensor + # Loop is of the following pattern + # input (trip_count, cond) + # int trip_count = ...; + # bool cond = ...; + # for (int i=0; i < trip_count && cond; ++i) { + # cond = ...; + # } + + # Loop conditions + loop_condition = g.op("Constant", value_t=torch.tensor(1)) + loop_condition = g.op("Cast", loop_condition, to_i=_C_onnx.TensorProtoDataType.BOOL) + loop_len = reps + + # Create an empty sequence to store final expansions + final_splits = g.op("SequenceEmpty") + + # Loop inputs + loop, (loop_context,), _ = jit_utils.add_op_with_blocks( + g, "Loop", loop_len, loop_condition, final_splits, n_blocks=1 + ) + + loop_block = loop_context.block + block_input_iter = utils._add_input_to_block(loop_block) + cond = utils._add_input_to_block(loop_block) + final_splits = utils._add_input_to_block(loop_block) + + r_split = loop_context.op("SequenceAt", r_splits, block_input_iter) + i_split = loop_context.op("SequenceAt", i_splits, block_input_iter) + + i_split = opset11.unsqueeze(loop_context, i_split, dim + 1) + r_concat = [ + loop_context.op("Constant", value_t=torch.LongTensor(input_sizes[: dim + 1])), + r_split, + loop_context.op("Constant", value_t=torch.LongTensor(input_sizes[dim + 1 :])), + ] + r_concat = loop_context.op("Concat", *r_concat, axis_i=0) + i_split = opset9.expand(loop_context, i_split, r_concat, None) + i_split = symbolic_helper._reshape_helper( + loop_context, i_split, g.op("Constant", value_t=torch.LongTensor(output_sizes)) + ) + final_splits = loop_context.op("SequenceInsert", final_splits, i_split) + + # Loop outputs + cond_out = loop_context.op( + "Cast", loop_condition, to_i=_C_onnx.TensorProtoDataType.BOOL + ) + utils._add_output_to_block(loop_block, cond_out) + 
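+ # Loop body outputs follow the ONNX Loop convention: the continuation
+ # condition comes first, then the loop-carried sequence that accumulates
+ # one interleaved slice per iteration.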
utils._add_output_to_block(loop_block, final_splits) + + loop_out = loop.node().output() + loop_out = g.op("ConcatFromSequence", loop_out, axis_i=dim) + return loop_out + + +@_onnx_symbolic("aten::diagonal") +@symbolic_helper.parse_args("v", "i", "i", "i") +def diagonal(g: jit_utils.GraphContext, self, offset, dim1, dim2): + rank = symbolic_helper._get_tensor_rank(self) + # Replace negative indexing when rank is known + if rank is not None: + dim1 = dim1 if dim1 >= 0 else dim1 + rank + dim2 = dim2 if dim2 >= 0 else dim2 + rank + + dim1_size = opset9.size( + g, self, dim=g.op("Constant", value_t=torch.LongTensor([dim1])) + ) + dim2_size = opset9.size( + g, self, dim=g.op("Constant", value_t=torch.LongTensor([dim2])) + ) + # Create appropriate mask + mask_shape = g.op("Concat", dim1_size, dim2_size, axis_i=0) + mask = opset9.zeros(g, mask_shape, None, None, None) + mask = g.op("EyeLike", mask, k_i=offset) + # dim1 and dim2 appended as a dimension at the end of the shape + + if rank is not None: + axes = list(range(rank)) + axes.remove(dim1) + axes.remove(dim2) + self = g.op("Transpose", self, perm_i=axes + [dim1, dim2]) + else: + return symbolic_helper._unimplemented("diagonal", "unknown input rank") + + # Multiply input and mask to calculate values along diagonal + # The mask consists of one values where diagonal values are to be calculated + # For example: + # [[1.1, 1.2, 1.3], * [[1, 0, 0] = [[1.1, 0, 0], + # [2.1, 2.2, 2.3], [0, 1, 0] [0, 2.2, 0], + # [3.1, 3.2, 3.3]] [0, 0, 1]] [0, 0, 3.3]] + result = g.op("Mul", self, mask) + result = symbolic_helper._reducesum_helper(g, result, axes_i=[-1], keepdims_i=0) + + # Calculate gather indices based on offset and dims + # If offset is greater than zero, set offset to zero as this aids in + # calculation of selection window + offset_op = g.op("Constant", value_t=torch.LongTensor([offset])) + if offset >= 0: + diag_size = g.op( + "Max", + g.op("Min", dim1_size, g.op("Sub", dim2_size, offset_op)), + g.op("Constant", value_t=torch.LongTensor([0])), + ) + offset = 0 + else: + diag_size = g.op( + "Max", + g.op("Min", g.op("Add", dim1_size, offset_op), dim2_size), + g.op("Constant", value_t=torch.LongTensor([0])), + ) + diag_size = g.op("Concat", diag_size, axis_i=0) + + # Calculate which diagonal values to select + # For example, in cases with offsets: + # [[0, 1.1, 0] + # [0, 0, 2.2]] + # we need to select the last two columns, so we create a tensor + # with all columns that are to be selected + # So in this example, it is [1, 2] + select_window_ones_fill = opset9.ones(g, diag_size, 4, None, None) + select_window = g.op( + "CumSum", + select_window_ones_fill, + g.op("Constant", value_t=torch.LongTensor([0])), + ) + select_window = g.op( + "Add", + select_window, + g.op("Constant", value_t=torch.LongTensor([abs(offset) - 1])), + ) + + gather_shape = [ + opset9.size(g, result, dim=g.op("Constant", value_t=torch.LongTensor([axis]))) + for axis in list(range(rank))[:-2] + ] + gather_shape.append(diag_size) + gather_shape = g.op("Concat", *gather_shape, axis_i=0) + gather_indices = opset9.zeros(g, gather_shape, 4, None, None) + + # There might be cases where offset value is greater than number of rows/columns + # and might cause the diagonal to overrun and as a result of this, diag_size would be zero. 
+ # For example, if + # offset = 9, dim1_size = 2 (columns), dim2_size = 4 (rows) + # diag_size = max(min(2, (4-9)), 0) = 0, based on calculation above + # Cases with diagonal overrun always result in diag_size = max(0, -ve value) = 0 + # In cases without diagonal overrun, we select the appropriate rows/columns along which we + # are calculating diagonal values. In cases with diagonal overrun, we return a tensor which has + # the dimension of the row/column where overrun occurred as 0-dim, as we are essentially + # returning an empty tensor + overrun_cond = g.op( + "Not", + g.op( + "Equal", + diag_size, + g.op("Constant", value_t=torch.tensor(0, dtype=torch.int64)), + ), + ) + + if_op, (if_context, else_context), _ = jit_utils.add_op_with_blocks( + g, "If", overrun_cond, n_blocks=2 + ) + + gather_indices_if_block = if_context.op("Add", gather_indices, select_window) + gather_indices_if_block = symbolic_helper._unsqueeze_helper( + if_context, gather_indices_if_block, [rank - 1] + ) + final_non_overrun = if_context.op( + "GatherND", result, gather_indices_if_block, batch_dims_i=rank - 2 + ) + final_overrun = opset9.zeros(else_context, gather_shape, 6, None, None) + utils._add_output_to_block(if_context.block, final_non_overrun) + utils._add_output_to_block(else_context.block, final_overrun) + return if_op + + +# Quantized ops + + +@_onnx_symbolic("quantized::linear") +def quantized_linear( + g: jit_utils.GraphContext, q_input, q_weight, bias, op_scale, op_zero_point +): + input, input_scale, _, _ = symbolic_helper.dequantize_helper(g, q_input) + weight, weight_scale, _, axis = symbolic_helper.dequantize_helper(g, q_weight) + q_bias = symbolic_helper.requantize_bias_helper( + g, bias, input_scale, weight_scale, axis + ) + bias, _, _, _ = symbolic_helper.dequantize_helper(g, q_bias) + + output = opset9.linear(g, input, weight, bias) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::linear_relu") +def quantized_linear_relu( + g: jit_utils.GraphContext, q_input, q_weight, bias, op_scale, op_zero_point +): + input, input_scale, _, _ = symbolic_helper.dequantize_helper(g, q_input) + weight, weight_scale, _, axis = symbolic_helper.dequantize_helper(g, q_weight) + q_bias = symbolic_helper.requantize_bias_helper( + g, bias, input_scale, weight_scale, axis + ) + bias, _, _, _ = symbolic_helper.dequantize_helper(g, q_bias) + + output = opset9.linear(g, input, weight, bias) + output = opset9.relu(g, output) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::conv1d_relu") +def quantized_conv1d_relu( + g: jit_utils.GraphContext, + q_input, + q_weight, + bias, + stride, + padding, + dilation, + groups, + op_scale, + op_zero_point, +): + input, input_scale, _, _ = symbolic_helper.dequantize_helper(g, q_input) + weight, weight_scale, _, axis = symbolic_helper.dequantize_helper(g, q_weight) + q_bias = symbolic_helper.requantize_bias_helper( + g, bias, input_scale, weight_scale, axis + ) + bias, _, _, _ = symbolic_helper.dequantize_helper(g, q_bias) + + output = opset9.conv1d(g, input, weight, bias, stride, padding, dilation, groups) + output = opset9.relu(g, output) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::conv2d_relu") +def quantized_conv2d_relu( + g: jit_utils.GraphContext, + q_input, + q_weight, + bias, + stride, + padding, + dilation, + groups, + op_scale, + op_zero_point, +): + input, input_scale, _, _ = 
symbolic_helper.dequantize_helper(g, q_input) + weight, weight_scale, _, axis = symbolic_helper.dequantize_helper(g, q_weight) + q_bias = symbolic_helper.requantize_bias_helper( + g, bias, input_scale, weight_scale, axis + ) + bias, _, _, _ = symbolic_helper.dequantize_helper(g, q_bias) + + output = opset9.conv2d(g, input, weight, bias, stride, padding, dilation, groups) + output = opset9.relu(g, output) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::conv3d_relu") +def quantized_conv3d_relu( + g: jit_utils.GraphContext, + q_input, + q_weight, + bias, + stride, + padding, + dilation, + groups, + op_scale, + op_zero_point, +): + input, input_scale, _, _ = symbolic_helper.dequantize_helper(g, q_input) + weight, weight_scale, _, axis = symbolic_helper.dequantize_helper(g, q_weight) + q_bias = symbolic_helper.requantize_bias_helper( + g, bias, input_scale, weight_scale, axis + ) + bias, _, _, _ = symbolic_helper.dequantize_helper(g, q_bias) + + output = opset9.conv3d(g, input, weight, bias, stride, padding, dilation, groups) + output = opset9.relu(g, output) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::conv1d") +def quantized_conv1d( + g: jit_utils.GraphContext, + q_input, + q_weight, + bias, + stride, + padding, + dilation, + groups, + op_scale, + op_zero_point, +): + input, input_scale, _, _ = symbolic_helper.dequantize_helper(g, q_input) + weight, weight_scale, _, axis = symbolic_helper.dequantize_helper(g, q_weight) + q_bias = symbolic_helper.requantize_bias_helper( + g, bias, input_scale, weight_scale, axis + ) + bias, _, _, _ = symbolic_helper.dequantize_helper(g, q_bias) + + output = opset9.conv1d(g, input, weight, bias, stride, padding, dilation, groups) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::conv2d") +def quantized_conv2d( + g: jit_utils.GraphContext, + q_input, + q_weight, + bias, + stride, + padding, + dilation, + groups, + op_scale, + op_zero_point, +): + input, input_scale, _, _ = symbolic_helper.dequantize_helper(g, q_input) + weight, weight_scale, _, axis = symbolic_helper.dequantize_helper(g, q_weight) + q_bias = symbolic_helper.requantize_bias_helper( + g, bias, input_scale, weight_scale, axis + ) + bias, _, _, _ = symbolic_helper.dequantize_helper(g, q_bias) + + output = opset9.conv2d(g, input, weight, bias, stride, padding, dilation, groups) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::conv3d") +def quantized_conv3d( + g: jit_utils.GraphContext, + q_input, + q_weight, + bias, + stride, + padding, + dilation, + groups, + op_scale, + op_zero_point, +): + input, input_scale, _, _ = symbolic_helper.dequantize_helper(g, q_input) + weight, weight_scale, _, axis = symbolic_helper.dequantize_helper(g, q_weight) + q_bias = symbolic_helper.requantize_bias_helper( + g, bias, input_scale, weight_scale, axis + ) + bias, _, _, _ = symbolic_helper.dequantize_helper(g, q_bias) + + output = opset9.conv3d(g, input, weight, bias, stride, padding, dilation, groups) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::conv_transpose1d") +def quantized_conv_transpose1d( + g: jit_utils.GraphContext, + q_input, + q_weight, + bias, + stride, + padding, + output_padding, + dilation, + groups, + op_scale, + op_zero_point, +): + input, input_scale, _, _ = 
symbolic_helper.dequantize_helper(g, q_input) + weight, weight_scale, _, axis = symbolic_helper.dequantize_helper(g, q_weight) + q_bias = symbolic_helper.requantize_bias_helper( + g, bias, input_scale, weight_scale, axis + ) + bias, _, _, _ = symbolic_helper.dequantize_helper(g, q_bias) + + output = opset9.conv_transpose2d( + g, input, weight, bias, stride, padding, output_padding, groups, dilation + ) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::conv_transpose2d") +def quantized_conv_transpose2d( + g: jit_utils.GraphContext, + q_input, + q_weight, + bias, + stride, + padding, + output_padding, + dilation, + groups, + op_scale, + op_zero_point, +): + input, input_scale, _, _ = symbolic_helper.dequantize_helper(g, q_input) + weight, weight_scale, _, axis = symbolic_helper.dequantize_helper(g, q_weight) + q_bias = symbolic_helper.requantize_bias_helper( + g, bias, input_scale, weight_scale, axis + ) + bias, _, _, _ = symbolic_helper.dequantize_helper(g, q_bias) + + output = opset9.conv_transpose2d( + g, input, weight, bias, stride, padding, output_padding, groups, dilation + ) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +@_onnx_symbolic("quantized::conv_transpose3d") +def quantized_conv_transpose3d( + g: jit_utils.GraphContext, + q_input, + q_weight, + bias, + stride, + padding, + output_padding, + dilation, + groups, + op_scale, + op_zero_point, +): + input, input_scale, _, _ = symbolic_helper.dequantize_helper(g, q_input) + weight, weight_scale, _, axis = symbolic_helper.dequantize_helper(g, q_weight) + q_bias = symbolic_helper.requantize_bias_helper( + g, bias, input_scale, weight_scale, axis + ) + bias, _, _, _ = symbolic_helper.dequantize_helper(g, q_bias) + + output = opset9.conv_transpose3d( + g, input, weight, bias, stride, padding, output_padding, groups, dilation + ) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) diff --git a/.venv/Lib/site-packages/torch/onnx/symbolic_opset14.py b/.venv/Lib/site-packages/torch/onnx/symbolic_opset14.py new file mode 100644 index 0000000000000000000000000000000000000000..2407af69a75d4a87636c4473866df039749b568c --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/symbolic_opset14.py @@ -0,0 +1,283 @@ +# mypy: allow-untyped-defs +# mypy: disable-error-code=arg-type +"""This file exports ONNX ops for opset 14. + +Note [ONNX operators that are added/updated in opset 14] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +New operators: + HardSwish, Trilu + +Updated operators: + Reshape + Add, Sub, Mul, Div + GRU, LSTM, RNN + BatchNorm, Cumsum, Relu +""" + +# EDITING THIS FILE? READ THIS FIRST! 
+# see Note [Edit Symbolic Files] in README.md +from __future__ import annotations + +import functools + +import torch +from torch.onnx import _constants, _type_utils, symbolic_helper +from torch.onnx._globals import GLOBALS +from torch.onnx._internal import jit_utils, registration + + +__all__ = [ + "hardswish", + "tril", + "triu", + "reshape", + "batch_norm", + "quantized_hardswish", + "scaled_dot_product_attention", +] + +_onnx_symbolic = functools.partial(registration.onnx_symbolic, opset=14) + + +@_onnx_symbolic("aten::hardswish") +@symbolic_helper.parse_args("v") +def hardswish(g: jit_utils.GraphContext, self): + return g.op("HardSwish", self) + + +@_onnx_symbolic("aten::tril") +def tril(g: jit_utils.GraphContext, self, diagonal, out=None): + return g.op("Trilu", self, diagonal, upper_i=0) + + +@_onnx_symbolic("aten::triu") +def triu(g: jit_utils.GraphContext, self, diagonal, out=None): + return g.op("Trilu", self, diagonal, upper_i=1) + + +@_onnx_symbolic("aten::reshape") +@symbolic_helper.quantized_args(True) +@symbolic_helper.parse_args("v", "v") +def reshape(g: jit_utils.GraphContext, self, shape): + # NOTE: Due to bug in ORT https://github.com/microsoft/onnxruntime/issues/10664 + # Reshape export cannot utilize the new allowzero attribute introduced in opset 14. + return symbolic_helper._reshape_helper(g, self, shape, allowzero=0) + + +@_onnx_symbolic("aten::batch_norm") +@symbolic_helper.parse_args("v", "v", "v", "v", "v", "i", "f", "f", "i") +def batch_norm( + g: jit_utils.GraphContext, + input, + weight, + bias, + running_mean, + running_var, + training, + momentum, + eps, + cudnn_enabled, +): + if ( + torch.is_autocast_enabled() + and not symbolic_helper.args_have_same_dtype( + [input, weight, bias, running_mean, running_var] + ) + and GLOBALS.export_onnx_opset_version < 15 + ): + return symbolic_helper._onnx_opset_unsupported_detailed( + "BatchNormalization", + 14, + 15, + "All input tensors must have the same `dtype`." 
+ " Turn off Autocast or export using opset version 15.", + input, + ) + + symbolic_helper.check_training_mode(training, "batch_norm") + weight, bias, running_mean, running_var = symbolic_helper._batchnorm_helper( + g, input, weight, bias, running_mean, running_var + ) + out = g.op( + "BatchNormalization", + input, + weight, + bias, + running_mean, + running_var, + epsilon_f=eps, + momentum_f=1 - momentum, + training_mode_i=0 if not training else 1, + outputs=1 if not training else 3, + ) + if not training: + return out + else: + res, new_running_mean, new_running_var = out + new_running_mean.setType(running_mean.type()) + new_running_var.setType(running_var.type()) + return res + + +@_onnx_symbolic("quantized::hardswish") +def quantized_hardswish(g: jit_utils.GraphContext, x, op_scale, op_zero_point): + x, _, _, _ = symbolic_helper.dequantize_helper(g, x) + + output = hardswish(g, x) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +# Ported from +# https://github.com/microsoft/onnxscript/blob/6b1b81700b4523f31d8c6d3321e5d8ef5d42b764/onnxscript/function_libs/torch_aten/ops/nn.py#L1504 +# aten_scaled_dot_product_attention +# NOTE: Need op.Trilu +@_onnx_symbolic("aten::scaled_dot_product_attention") +@symbolic_helper.parse_args("v", "v", "v", "v", "f", "b", "v", "b") +def scaled_dot_product_attention( + g: jit_utils.GraphContext, + query: torch._C.Value, + key: torch._C.Value, + value: torch._C.Value, + attn_mask: torch._C.Value | None = None, + dropout_p: float = 0.0, + is_causal: bool = False, + scale: torch._C.Value | None = None, + enable_gqa: bool = False, +): + assert (not is_causal) or ( + is_causal and symbolic_helper._is_none(attn_mask) + ), "is_causal and attn_mask cannot be set at the same time" + assert not enable_gqa, "conversion of scaled_dot_product_attention not implemented if enable_gqa is True" + + if symbolic_helper._is_none(scale): + scale = _attention_scale(g, query) + + if is_causal: + attn_mask = _causal_attention_mask(g, query, key) + + # Swap the last two axes of key + # NOTE: onnx-script has different logic here, because the attribute perms in + # transpose needs list of ints + key_shape_builtin = symbolic_helper._get_tensor_rank(key) + key_transposed_axes = list(range(key_shape_builtin)) + key_transposed_axes[-1], key_transposed_axes[-2] = ( + key_transposed_axes[-2], + key_transposed_axes[-1], + ) + key_transposed = g.op("Transpose", key, perm_i=key_transposed_axes) + + # https://github.com/pytorch/pytorch/blob/12da0c70378b5be9135c6fda62a9863bce4a4818/aten/src/ATen/native/transformers/attention.cpp#L653 + # Scale q, k before matmul for stability see https://tinyurl.com/sudb9s96 for math + query_scaled = g.op("Mul", query, g.op("Sqrt", scale)) + key_transposed_scaled = g.op("Mul", key_transposed, g.op("Sqrt", scale)) + mul_qk = g.op("MatMul", query_scaled, key_transposed_scaled) + + if symbolic_helper._is_none(attn_mask): + mul_qk_add = mul_qk + elif ( + _type_utils.JitScalarType.from_value(attn_mask) + == _type_utils.JitScalarType.BOOL + ): + # Turn the Boolean mask to float: attn_mask.masked_fill(not attn_mask, -float('inf')) + const_zero = g.op("Constant", value_t=torch.tensor([0.0])) + const_neg_inf = g.op("Constant", value_t=torch.tensor([-float("inf")])) + attn_mask = g.op("Where", attn_mask, const_zero, const_neg_inf) + mul_qk_add = g.op("Add", mul_qk, attn_mask) + elif _type_utils.JitScalarType.from_value(attn_mask) in ( + _type_utils.JitScalarType.FLOAT, + _type_utils.JitScalarType.HALF, + 
_type_utils.JitScalarType.BFLOAT16, + ): + mul_qk_add = g.op("Add", mul_qk, attn_mask) + else: + raise ValueError( + f"Unsupported type for attn_mask: {_type_utils.JitScalarType.from_value(attn_mask)}" + ) + + attn_weight = g.op("Softmax", mul_qk_add, axis_i=-1) + + if dropout_p != 0: + attn_weight = g.op( + "Dropout", + attn_weight, + g.op("Constant", value_t=torch.tensor(dropout_p, dtype=torch.float)), + ) + + return g.op("MatMul", attn_weight, value) + + +def _attention_scale( + g: jit_utils.GraphContext, query: torch._C.Value +) -> torch._C.Value: + """Calculate the scale factor for the attention result. + + Args: + query: Tensor of shape [..., L, E] + + Returns: + Scalar scale factor := 1 / math.sqrt(query.size(-1)) + """ + query_shape = g.op("Shape", query) + query_shape_last = g.op( + "Slice", + query_shape, + g.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64)), + g.op( + "Constant", value_t=torch.tensor([_constants.INT64_MAX], dtype=torch.int64) + ), + ) + embedding_size = g.op( + "Cast", + query_shape_last, + to_i=_type_utils.JitScalarType.from_value(query).onnx_type(), + ) + const_one = g.op("Constant", value_t=torch.tensor([1.0], dtype=torch.float)) + scale = g.op("Div", const_one, g.op("Sqrt", embedding_size)) + # Add a Cast to convert the scale back to original type + scale = g.op( + "Cast", + scale, + to_i=_type_utils.JitScalarType.from_value(query).onnx_type(), + ) + return scale + + +def _causal_attention_mask( + g: jit_utils.GraphContext, query: torch._C.Value, key: torch._C.Value +) -> torch._C.Value: + """Create a causal mask for the given query and key tensors. + + Equivalent to:: + mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0) + attn_mask = torch.zeros(L, S, dtype=torch.float) + attn_mask = attn_mask.masked_fill(not mask, -float("inf")) + + Args: + query: Tensor of shape [..., L, E] + key: Tensor of shape [..., S, E] + + Returns: + Tensor of shape [L, S] + """ + + query_shape = g.op("Shape", query) + key_shape = g.op("Shape", key) + + last_idx = g.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64)) + second_last_idx = g.op("Constant", value_t=torch.tensor([-2], dtype=torch.int64)) + target_length = g.op("Slice", query_shape, second_last_idx, last_idx) + source_length = g.op("Slice", key_shape, second_last_idx, last_idx) + # attn_mask = torch.ones(L, S) := { + size = g.op("Concat", target_length, source_length, axis_i=0) + const_one = g.op("Constant", value_t=torch.tensor([1.0])) + attn_mask = g.op("Expand", const_one, size) + # } + attn_mask = g.op("Trilu", attn_mask, upper_i=0) + # The causal mask has 0s in the lower triangle and -inf in the upper triangle. + const_zero = g.op("Constant", value_t=torch.tensor([0.0])) + const_neg_inf = g.op("Constant", value_t=torch.tensor([-float("inf")])) + attn_mask = g.op( + "Where", g.op("Equal", attn_mask, const_zero), const_neg_inf, const_zero + ) + return attn_mask diff --git a/.venv/Lib/site-packages/torch/onnx/symbolic_opset15.py b/.venv/Lib/site-packages/torch/onnx/symbolic_opset15.py new file mode 100644 index 0000000000000000000000000000000000000000..94ba4fcff2b1141a19e808cc26110135a26c986c --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/symbolic_opset15.py @@ -0,0 +1,80 @@ +# mypy: allow-untyped-defs +"""This file exports ONNX ops for opset 15. 
+ +Note [ONNX operators that are added/updated in opset 15] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +https://github.com/onnx/onnx/blob/master/docs/Changelog.md#version-15-of-the-default-onnx-operator-set +New operators: + Bernoulli + CastLike + Optional + OptionalGetElement + OptionalHasElement + +Updated operators: + BatchNormalization https://github.com/onnx/onnx/pull/3545 + Backwards compatible + TODO: test coverage for mixed types inputs. + Pow https://github.com/onnx/onnx/pull/3412 + Backwards compatible + TODO: bfloat16 support. + Shape https://github.com/onnx/onnx/pull/3580 + Backwards compatible + TODO: optional start/end attribute. +""" + +# EDITING THIS FILE? READ THIS FIRST! +# see Note [Edit Symbolic Files] in README.md + +import functools + +import torch +from torch import _C +from torch.onnx import symbolic_helper, symbolic_opset9 as opset9 +from torch.onnx._internal import jit_utils, registration + + +_onnx_symbolic = functools.partial(registration.onnx_symbolic, opset=15) + + +@_onnx_symbolic("aten::__is_") +def aten__is_(g: jit_utils.GraphContext, self, other): + if symbolic_helper._is_none(other): + if isinstance(self.type(), _C.OptionalType): + none = g.op("OptionalHasElement", self) + return g.op("Not", none) + else: + return g.op("Constant", value_t=torch.BoolTensor([0])) + return opset9.eq(g, self, other) + + +@_onnx_symbolic("aten::__isnot_") +@opset9.wrap_logical_op_with_negation # type: ignore[has-type] +def aten__isnot_(g: jit_utils.GraphContext, self, other): + return aten__is_(g, self, other) + + +@_onnx_symbolic("aten::bernoulli") +def bernoulli(g: jit_utils.GraphContext, input, p=None, generator=None, out=None): + if out is not None and not symbolic_helper._is_none(out): + symbolic_helper._unimplemented( + "Bernoulli", "out parameter is not supported for bernoulli", input + ) + if generator is not None and not symbolic_helper._is_none(generator): + symbolic_helper._unimplemented( + "Bernoulli", "generator is not supported for bernoulli", input + ) + if p is None or symbolic_helper._is_none(p): + return g.op("Bernoulli", input) + return opset9.bernoulli(g, input, p, generator, out) + + +@_onnx_symbolic("prim::unchecked_cast") +def prim_unchecked_cast(g: jit_utils.GraphContext, self): + # exists to refine the type of the Value + # if x is Optional[Tensor], unchecked_cast will cast + # x to Tensor, so the rest of the graph knows that x is a Tensor. + if isinstance(self.type(), _C.OptionalType): + return g.op("OptionalGetElement", self) + + return self diff --git a/.venv/Lib/site-packages/torch/onnx/symbolic_opset16.py b/.venv/Lib/site-packages/torch/onnx/symbolic_opset16.py new file mode 100644 index 0000000000000000000000000000000000000000..a0ccfc72ce57a7df2a3bd9a11bbcc412b765af18 --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/symbolic_opset16.py @@ -0,0 +1,185 @@ +# mypy: allow-untyped-defs +"""This file exports ONNX ops for opset 16. + +Note [ONNX Operators that are added/updated in opset 16] + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +https://github.com/onnx/onnx/blob/main/docs/Changelog.md#version-16-of-the-default-onnx-operator-set +New operators: + GridSample https://github.com/onnx/onnx/pull/3557 + +Updated operators: + Identity + If + LeakyRelu + Loop + PRelu + RoiAlign + Scan + ScatterElements + ScatterND + Where + GreaterOrEqual + LessOrEqual +""" + +# EDITING THIS FILE? READ THIS FIRST! 
+# see Note [Edit Symbolic Files] in README.md + +import functools + +import torch +from torch.nn.functional import ( + GRID_SAMPLE_INTERPOLATION_MODES, + GRID_SAMPLE_PADDING_MODES, +) +from torch.onnx import _type_utils, errors, symbolic_helper, utils +from torch.onnx._internal import jit_utils, registration + + +_onnx_symbolic = functools.partial(registration.onnx_symbolic, opset=16) + + +# note (mkozuki): Why `grid_sampler` instead of `grid_sample`? +# Because `torch.nn.functional.grid_sample` calls `torch.grid_sampler`. +@_onnx_symbolic("aten::grid_sampler") +@symbolic_helper.parse_args("v", "v", "i", "i", "b") +def grid_sampler( + g: jit_utils.GraphContext, + input, + grid, + mode_enum, + padding_mode_enum, + align_corners, +): + # Check the input and grid tensor rank beforehand. + if symbolic_helper._get_tensor_rank(input) == 5: + return symbolic_helper._onnx_unsupported("GridSample with 5D volumetric input") + mode_s = {v: k for k, v in GRID_SAMPLE_INTERPOLATION_MODES.items()}[mode_enum] # type: ignore[call-arg] + padding_mode_s = {v: k for k, v in GRID_SAMPLE_PADDING_MODES.items()}[ # type: ignore[call-arg] + padding_mode_enum + ] + return g.op( + "GridSample", + input, + grid, + align_corners_i=int(align_corners), + mode_s=mode_s, + padding_mode_s=padding_mode_s, + ) + + +@_onnx_symbolic("aten::scatter_add") +@symbolic_helper.parse_args("v", "i", "v", "v") +def scatter_add(g: jit_utils.GraphContext, self, dim, index, src): + src_type = _type_utils.JitScalarType.from_value( + src, _type_utils.JitScalarType.UNDEFINED + ) + src_sizes = symbolic_helper._get_tensor_sizes(src) + index_sizes = symbolic_helper._get_tensor_sizes(index) + + if len(src_sizes) != len(index_sizes): + return symbolic_helper._unimplemented( + "scatter_add", + f"`index` ({index_sizes}) should have the same dimensionality as `src` ({src_sizes})", + ) + + # PyTorch only allows index shape <= src shape, so we can only consider + # taking index as subset size to src, like PyTorch does. When sizes for src + # and index are not matched or there are dynamic axes, we take index shape to + # slice src to accommodate. + if src_sizes != index_sizes or None in index_sizes: + adjusted_shape = g.op("Shape", index) + starts = g.op("Constant", value_t=torch.tensor([0] * len(index_sizes))) + src = g.op("Slice", src, starts, adjusted_shape) + + src = symbolic_helper._maybe_get_scalar(src) + if symbolic_helper._is_value(src): + return g.op("ScatterElements", self, index, src, axis_i=dim, reduction_s="add") + else: + # Check if scalar "src" has same type as self (PyTorch allows different + # type for scalar src (but not when src is tensor)). If not, insert Cast node. 
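+ # For example (illustrative): if `self` holds float32 values and `src` is the + # int scalar 2, the 2 is cast to float32 here before the ScatterElements below.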
+ if _type_utils.JitScalarType.from_value(self) != src_type: + src = g.op( + "Cast", + src, + to_i=_type_utils.JitScalarType.from_value(self).onnx_type(), + ) + + return g.op( + "ScatterElements", + self, + index, + src, + axis_i=dim, + reduction_s="add", + ) + + +@_onnx_symbolic("aten::scatter_reduce") +@symbolic_helper.parse_args("v", "i", "v", "v", "s", "b") +def scatter_reduce( + g: jit_utils.GraphContext, + self: torch._C.Value, + dim: int, + index: torch._C.Value, + src: torch._C.Value, + reduce: str, + include_self: bool, +): + if reduce == "mean": + raise errors.OnnxExporterError( + "ONNX does not support mean reduction for scatter_reduce" + ) + if not include_self: + raise errors.OnnxExporterError( + "ONNX does not support include_self=False for scatter_reduce" + ) + + reduce_mode = { # convert torch string name to onnx string name + "mean": "none", # 'mean' is not supported in the ONNX 1.14 definition + "sum": "add", + "prod": "mul", + "amin": "min", + "amax": "max", + } + onnx_reduce = reduce_mode[reduce] + + self_rank = g.op("Size", g.op("Shape", self)) + + # if self_rank == 0: # assert (index_rank == 0 and rank_src == 0) + self_rank_is_zero = g.op( + "Equal", self_rank, g.op("Constant", value_t=torch.tensor(0, dtype=torch.int64)) + ) + if_op, (if_context, else_context), _ = jit_utils.add_op_with_blocks( + g, "If", self_rank_is_zero, n_blocks=2, outputs=3 + ) + neg_1 = if_context.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64)) + + self_reshape = if_context.op("Reshape", self, neg_1) + utils._add_output_to_block(if_context.block, self_reshape) + index_reshape = if_context.op("Reshape", index, neg_1) + utils._add_output_to_block(if_context.block, index_reshape) + src_reshape = if_context.op("Reshape", src, neg_1) + utils._add_output_to_block(if_context.block, src_reshape) + + self_identity = else_context.op("Identity", self) + utils._add_output_to_block(else_context.block, self_identity) + index_identity = else_context.op("Identity", index) + utils._add_output_to_block(else_context.block, index_identity) + src_identity = else_context.op("Identity", src) + utils._add_output_to_block(else_context.block, src_identity) + + result = g.op("ScatterElements", *if_op, axis_i=dim, reduction_s=onnx_reduce) + + # if self_rank == 0: + if_op, (if_context, else_context), _ = jit_utils.add_op_with_blocks( + g, "If", self_rank_is_zero, n_blocks=2, outputs=1 + ) + result_squeezed = if_context.op("Squeeze", result) + utils._add_output_to_block(if_context.block, result_squeezed) + result_identity = else_context.op("Identity", result) + utils._add_output_to_block(else_context.block, result_identity) + result_final = if_op.node().output() + + return result_final diff --git a/.venv/Lib/site-packages/torch/onnx/symbolic_opset17.py b/.venv/Lib/site-packages/torch/onnx/symbolic_opset17.py new file mode 100644 index 0000000000000000000000000000000000000000..e62ec00c9c7e1f9398f2bfed0641d428bc6737bd --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/symbolic_opset17.py @@ -0,0 +1,231 @@ +# mypy: allow-untyped-defs +# mypy: disable-error-code=arg-type +"""This file exports ONNX ops for opset 17.
+ +Note [ONNX Operators that are added/updated in opset 17] + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +https://github.com/onnx/onnx/blob/main/docs/Changelog.md#version-17-of-the-default-onnx-operator-set +New operators: + BlackmanWindow + DFT + HammingWindow + HannWindow + LayerNormalization + MelWeightMatrix + STFT + SequenceMap +""" + +import functools +from typing import Optional, Sequence + +import torch +from torch import _C +from torch.onnx import _type_utils, errors, symbolic_helper +from torch.onnx._internal import jit_utils, registration + + +# EDITING THIS FILE? READ THIS FIRST! +# see Note [Edit Symbolic Files] in README.md + +__all__ = ["layer_norm", "stft", "quantized_layer_norm"] + +_onnx_symbolic = functools.partial(registration.onnx_symbolic, opset=17) + + +@_onnx_symbolic("aten::layer_norm") +@symbolic_helper.parse_args("v", "is", "v", "v", "f", "none") +def layer_norm( + g: jit_utils.GraphContext, + input: _C.Value, + normalized_shape: Sequence[int], + weight: _C.Value, + bias: _C.Value, + eps: float, + cudnn_enable: bool, +): + # normalized_shape: the trailing input dimensions over which to normalize + # axis: The first normalization dimension. + # layer_norm normalizes on the last D dimensions, + # where D is the size of normalized_shape + axis = -len(normalized_shape) + scalar_type = _type_utils.JitScalarType.from_value( + input, _type_utils.JitScalarType.FLOAT + ) + dtype = scalar_type.dtype() + if symbolic_helper._is_none(weight): + weight_value = torch.ones(normalized_shape, dtype=dtype) + weight = g.op("Constant", value_t=weight_value) + if symbolic_helper._is_none(bias): + bias_value = torch.zeros(normalized_shape, dtype=dtype) + bias = g.op("Constant", value_t=bias_value) + return g.op( + "LayerNormalization", + input, + weight, + bias, + epsilon_f=eps, + axis_i=axis, + ) + + +@_onnx_symbolic("quantized::layer_norm") +def quantized_layer_norm( + g: jit_utils.GraphContext, + x, + normalized_shape, + weight, + bias, + eps, + op_scale, + op_zero_point, +): + x, _, _, _ = symbolic_helper.dequantize_helper(g, x) + + output = layer_norm(g, x, normalized_shape, weight, bias, eps, False) + + return symbolic_helper.quantize_helper(g, output, op_scale, op_zero_point) + + +def _compute_edge_sizes(n_fft, window_size): + """Helper function to compute the sizes of the edges (left and right) + of a given window centered within an FFT size.""" + left = (n_fft - window_size) // 2 + right = n_fft - left - window_size + return left, right + + +@_onnx_symbolic("aten::stft") +@symbolic_helper.parse_args("v", "i", "i", "i", "v", "b", "b", "b") +def stft( + g: jit_utils.GraphContext, + input: _C.Value, + n_fft: int, + hop_length: Optional[int] = None, + win_length: Optional[int] = None, + window: Optional[_C.Value] = None, + normalized: bool = False, + onesided: Optional[bool] = True, + return_complex: Optional[bool] = False, +) -> _C.Value: + """Associates `torch.stft` with the `STFT` ONNX operator. + Note that torch.stft calls _VF.stft, without centering or padding options. + Hence, this function does not contain these two arguments. + See torch.stft source code for more info. + + Args: + g: Graph to write the ONNX representation into + input: Input tensor for the transformation + n_fft: FFT size + hop_length: Size of the hop. Defaults to `floor(n_fft / 4)` + win_length: Size of the analysis window. Defaults to `n_fft` + window: Analysis window.
Defaults to a window of all ones + normalized: Whether to return a normalized STFT + onesided: Whether to return only half (+1) of the results, given the + symmetry of the STFT + return_complex: Whether to return the complex value (Note: Must be + `False` or `None`) + + Returns: + op: Operator for torch.stft associated with STFT (ONNX) + """ + # Checks + if return_complex: + raise errors.SymbolicValueError( + msg="STFT does not currently support complex types", value=input + ) + + # Get STFT sizes + frame_step_value = hop_length if hop_length is not None else n_fft // 4 + frame_step_const = g.op( + "Constant", value_t=torch.tensor(frame_step_value, dtype=torch.int64) + ) + frame_length_const = g.op( + "Constant", value_t=torch.tensor(n_fft, dtype=torch.int64) + ) + + # Pre-process input if needed + signal = input + signal_rank = symbolic_helper._get_tensor_rank(signal) + if signal_rank == 1: + # Add batch dimension + signal = g.op( + "Unsqueeze", + signal, + g.op("Constant", value_t=torch.tensor([0], dtype=torch.int64)), + ) + elif signal_rank is None or signal_rank > 2: + raise errors.SymbolicValueError( + msg="STFT can only take inputs of 1 [signal] or 2 [batch, signal] dimensions. " + f"Current rank of signal is {signal_rank}, please reduce it.", + value=input, + ) + + # Get window and make sure it's the same size as `win_length` or `n_fft` + n_win = symbolic_helper._get_tensor_dim_size(window, dim=0) + if n_win is not None: + win_length_default = win_length if win_length else n_fft + assert n_win == win_length_default, ( + "Analysis window size must equal `win_length` or `n_fft`. " + f"Please set `win_length` or `n_fft` to match `window` size ({n_win})" + ) + + # Center window around zeros if needed (required by ONNX's STFT) + if n_win < n_fft: + left, right = _compute_edge_sizes(n_fft, n_win) + left_win = g.op("Constant", value_t=torch.zeros(left)) + right_win = g.op("Constant", value_t=torch.zeros(right)) + window = g.op("Concat", left_win, window, right_win, axis_i=0) + + # Create window, if needed + if symbolic_helper._is_none(window): + if win_length: + if win_length > n_fft: + raise errors.SymbolicValueError( + msg="The analysis window can't be longer than the size of the FFT.
" + f"Please set `win_length` ({win_length}) to `n_fft` ({n_fft}) or less.", + value=input, + ) + + # Center window, if needed + left, right = _compute_edge_sizes(n_fft, win_length) + torch_window = torch.hstack( + (torch.zeros(left), torch.ones(win_length), torch.zeros(right)) + ) + else: + # Rectangle window + torch_window = torch.ones(n_fft) + assert torch_window.shape[0] == n_fft + window = g.op("Constant", value_t=torch_window) + window = g.op( + "Cast", window, to_i=_type_utils.JitScalarType.from_value(signal).onnx_type() + ) + + # Run STFT + result = g.op( + "STFT", + signal, + frame_step_const, + window, + frame_length_const, + onesided_i=1 if onesided is None or onesided else 0, + ) + + # Transpose to mimic torch.stft's behavior + result = g.op("Transpose", result, perm_i=[0, 2, 1, 3]) + + # Remove batch dimension, if needed + if signal_rank == 1: + result = g.op( + "Squeeze", + result, + g.op("Constant", value_t=torch.tensor([0], dtype=torch.int64)), + ) + + # Normalize, if needed + if normalized: + sqrt_nfft = torch.sqrt(torch.tensor(n_fft, dtype=signal.type().dtype())) + result = g.op("Div", result, g.op("Constant", value_t=sqrt_nfft)) + + return result diff --git a/.venv/Lib/site-packages/torch/onnx/symbolic_opset18.py b/.venv/Lib/site-packages/torch/onnx/symbolic_opset18.py new file mode 100644 index 0000000000000000000000000000000000000000..dcff5f746c0b6aa71a0b8fc2068a973d2e7a9b49 --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/symbolic_opset18.py @@ -0,0 +1,265 @@ +# mypy: allow-untyped-defs +"""This file exports ONNX ops for opset 18. + +Note [ONNX Operators that are added/updated in opset 18] + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +https://github.com/onnx/onnx/blob/main/docs/Changelog.md#version-18-of-the-default-onnx-operator-set +New operators: + BitwiseAnd + CenterCropPad + Col2Im + Mish + OptionalGetElement + OptionalHasElement + Pad + Resize + ScatterElements + ScatterND + Split +""" + +import functools +from typing import List, Optional, Sequence, Tuple + +import torch +from torch import _C +from torch.onnx import _type_utils, symbolic_helper, symbolic_opset9 as opset9 +from torch.onnx._internal import jit_utils, registration + + +# EDITING THIS FILE? READ THIS FIRST! 
+# see Note [Edit Symbolic Files] in symbolic_helper.py + +__all__ = [ + "col2im", +] + +_onnx_symbolic = functools.partial(registration.onnx_symbolic, opset=18) + + +@_onnx_symbolic("aten::__and_") +@_onnx_symbolic("aten::bitwise_and") +def __and_(g: jit_utils.GraphContext, self, other): + # do type promotion (scalars don't seem to apply) + args = [self, other] + # type promotion doesn't happen with torch.bitwise_and(tensor, scalar) + prom_args = [arg for arg in args if symbolic_helper._get_tensor_rank(arg)] + if len(prom_args) == 0: + prom_args = args + promotion_jit_type = symbolic_helper._type_promote_from_values(*prom_args) + self = symbolic_helper._maybe_cast_to_type(g, self, promotion_jit_type) + other = symbolic_helper._maybe_cast_to_type(g, other, promotion_jit_type) + if promotion_jit_type == _type_utils.JitScalarType.BOOL: + return g.op("And", self, other) + return g.op("BitwiseAnd", self, other) + + +@_onnx_symbolic("aten::col2im") +@symbolic_helper.parse_args("v", "v", "v", "is", "is", "is") +def col2im( + g, + input: _C.Value, + output_size: _C.Value, + kernel_size: _C.Value, + dilation: Sequence[int], + padding: Sequence[int], + stride: Sequence[int], +): + # convert [i0, i1, ..., in] into [i0, i0, i1, i1, ..., in, in] + adjusted_padding = [] + for pad in padding: + for _ in range(2): + adjusted_padding.append(pad) + + num_dimensional_axis = symbolic_helper._get_tensor_sizes(output_size)[0] + if not adjusted_padding: + adjusted_padding = [0, 0] * num_dimensional_axis + + if not dilation: + dilation = [1] * num_dimensional_axis + + if not stride: + stride = [1] * num_dimensional_axis + + return g.op( + "Col2Im", + input, + output_size, + kernel_size, + dilations_i=dilation, + pads_i=adjusted_padding, + strides_i=stride, + ) + + +@_onnx_symbolic( + "aten::mean", decorate=[symbolic_helper._apply_params("ReduceMean", "mean")] +) +@_onnx_symbolic( + "aten::prod", + decorate=[ + symbolic_helper._apply_params( + "ReduceProd", "prod", allow_multi_dim_support=False + ) + ], +) +def _reduce_with_dtype(onnx_op: str, name: str, allow_multi_dim_support: bool = True): + return symbolic_helper._reduce_with_dtype_helper( + onnx_op, name, allow_multi_dim_support + ) + + +@_onnx_symbolic("aten::native_layer_norm") +@symbolic_helper.quantized_args(True, False, False, False) +@symbolic_helper.parse_args("v", "is", "v", "v", "f") +def _native_layer_norm( + g: jit_utils.GraphContext, + input: _C.Value, + normalized_shape: Sequence[int], + weight: _C.Value, + bias: _C.Value, + eps: float, +) -> Tuple[_C.Value, _C.Value, _C.Value]: + return opset9.native_layer_norm(g, input, normalized_shape, weight, bias, eps) + + +@_onnx_symbolic("aten::glu") +@symbolic_helper.parse_args("v", "i") +def _glu(g: jit_utils.GraphContext, input, dim): + dim_size = symbolic_helper._get_tensor_dim_size(input, dim) + if dim_size is not None: + assert dim_size % 2 == 0 + + first, second = g.op("Split", input, axis_i=dim, num_outputs_i=2, outputs=2) + return g.op("Mul", first, g.op("Sigmoid", second)) + + +@_onnx_symbolic("aten::max") +# torch.max (same for torch.min) actually has two interfaces smashed together: +# torch.max(x, dim, keepdim) and torch.max(x, y) +# TODO(justinchuby): Support multiple quantized args in output +def max(g: jit_utils.GraphContext, self, dim_or_y=None, keepdim=None): + return symbolic_helper._max_helper(g, self, dim_or_y, keepdim) + + +@_onnx_symbolic("aten::maximum") +@symbolic_helper.quantized_args(True, True) +def maximum(g: jit_utils.GraphContext, input, other): + return max(g, input, 
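# e.g. (illustrative) torch.maximum(x, y) takes the elementwise branch here: + # keepdim stays None, so the helper emits an ONNX Max over the two inputs. +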
dim_or_y=other) + + +@_onnx_symbolic("aten::min") +# TODO(justinchuby): Support multiple quantized args in output +def min(g: jit_utils.GraphContext, self, dim_or_y=None, keepdim=None): + return symbolic_helper._min_helper(g, self, dim_or_y, keepdim) + + +@_onnx_symbolic("aten::minimum") +@symbolic_helper.quantized_args(True, True) +def minimum(g: jit_utils.GraphContext, input, other): + return min(g, input, dim_or_y=other) + + +@_onnx_symbolic("aten::amax") +@symbolic_helper.quantized_args(True) +@symbolic_helper.parse_args("v", "is", "i") +def amax(g: jit_utils.GraphContext, self, dim, keepdim): + axes = g.op("Constant", value_t=torch.tensor(dim, dtype=torch.long)) + return g.op("ReduceMax", self, axes, keepdims_i=keepdim) + + +@_onnx_symbolic("aten::amin") +@symbolic_helper.quantized_args(True) +@symbolic_helper.parse_args("v", "is", "i") +def amin(g: jit_utils.GraphContext, self, dim, keepdim): + axes = g.op("Constant", value_t=torch.tensor(dim, dtype=torch.long)) + return g.op("ReduceMin", self, axes, keepdims_i=keepdim) + + +@_onnx_symbolic("aten::aminmax") +@symbolic_helper.quantized_args(True) +@symbolic_helper.parse_args("v", "v", "i") +def aminmax(g: jit_utils.GraphContext, self, dim, keepdim): + if not symbolic_helper._is_none(dim): + dim = symbolic_helper._get_const(dim, "i", "dim") + axes = g.op("Constant", value_t=torch.tensor([dim], dtype=torch.long)) + return g.op("ReduceMin", self, axes, keepdims_i=keepdim), g.op( + "ReduceMax", self, axes, keepdims_i=keepdim + ) + else: + return g.op("ReduceMin", self, keepdims_i=keepdim), g.op( + "ReduceMax", self, keepdims_i=keepdim + ) + + +@_onnx_symbolic("aten::var_mean") +def _var_mean(g: jit_utils.GraphContext, input, *args): + if len(args) == 1: + return symbolic_helper._var_mean_helper(g, input, None, args[0], None) + else: + return symbolic_helper._var_mean_helper(g, input, *args) + + +@_onnx_symbolic("aten::logsumexp") +@symbolic_helper.parse_args("v", "is", "i") +def _logsumexp(g: jit_utils.GraphContext, input, dim, keepdim): + if dim is None: + return g.op("ReduceLogSumExp", input, keepdims_i=0) + else: + axes = g.op("Constant", value_t=torch.tensor(dim, dtype=torch.long)) + return g.op("ReduceLogSumExp", input, axes, keepdims_i=keepdim) + + +@_onnx_symbolic("aten::linalg_matrix_norm") +@symbolic_helper.parse_args("v", "v", "is", "b", "v") +def _linalg_matrix_norm( + g: jit_utils.GraphContext, + self: torch._C.Value, + ord: torch._C.Value, + dim: List[int], + keepdim: bool, + dtype: torch._C.Value, +): + return opset9.linalg_matrix_norm(g, self, ord, dim, keepdim, dtype) + + +@_onnx_symbolic("aten::embedding_bag") +@symbolic_helper.parse_args("v", "v", "v", "i", "i", "i", "v", "i", "i") +def embedding_bag( + g: jit_utils.GraphContext, + embedding_matrix, + indices, + offsets, + scale_grad_by_freq, + mode, + sparse, + per_sample_weights, + include_last_offset, + padding_idx, +): + return symbolic_helper._embedding_bag_helper( + g, + embedding_matrix, + indices, + offsets, + scale_grad_by_freq, + mode, + sparse, + per_sample_weights, + include_last_offset, + padding_idx, + ) + + +@_onnx_symbolic("aten::linalg_vector_norm") +@symbolic_helper.parse_args("v", "f", "is", "b", "v") +def linalg_vector_norm( + g: jit_utils.GraphContext, + self: torch._C.Value, + ord: float, + dim: Optional[Sequence[int]], + keepdim: bool, + dtype: torch._C.Value, +): + return symbolic_helper._linalg_vector_norm_helper(g, self, ord, dim, keepdim, dtype) diff --git a/.venv/Lib/site-packages/torch/onnx/symbolic_opset19.py 
b/.venv/Lib/site-packages/torch/onnx/symbolic_opset19.py new file mode 100644 index 0000000000000000000000000000000000000000..5e373526c842d4579c6498c43a13f7590d901f59 --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/symbolic_opset19.py @@ -0,0 +1,33 @@ +"""This file exports ONNX ops for opset 19. + +Note [ONNX Operators that are added/updated in opset 19] + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +https://github.com/onnx/onnx/blob/main/docs/Changelog.md#version-19-of-the-default-onnx-operator-set +New operators: +AveragePool +Cast +CastLike +Constant +DeformConv +DequantizeLinear +Equal +Identity +If +Loop +Pad +QuantizeLinear +Reshape +Resize +Scan +Shape +Size +""" + +from typing import List + + +# EDITING THIS FILE? READ THIS FIRST! +# see Note [Edit Symbolic Files] in symbolic_helper.py + +__all__: List[str] = [] diff --git a/.venv/Lib/site-packages/torch/onnx/symbolic_opset20.py b/.venv/Lib/site-packages/torch/onnx/symbolic_opset20.py new file mode 100644 index 0000000000000000000000000000000000000000..a6fb582f252759a7d8495ddcee481881a86b887c --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/symbolic_opset20.py @@ -0,0 +1,92 @@ +# mypy: allow-untyped-defs +"""This file exports ONNX ops for opset 20. + +Note [ONNX Operators that are added/updated in opset 20] + +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +https://github.com/onnx/onnx/blob/main/docs/Changelog.md#version-20-of-the-default-onnx-operator-set +New operators: + AffineGrid + ConstantOfShape + DFT + Gelu + GridSample + ImageDecoder + IsInf + IsNaN + ReduceMax + ReduceMin + RegexFullMatch + StringConcat + StringSplit +""" + +import functools + +import torch.nn.functional as F +from torch import _C +from torch.onnx import symbolic_helper +from torch.onnx._internal import jit_utils, registration + + +# EDITING THIS FILE? READ THIS FIRST! 
+# see Note [Edit Symbolic Files] in symbolic_helper.py + +__all__ = ["_grid_sampler", "_affine_grid_generator", "gelu"] + + +def convert_grid_sample_mode(mode_s): + return ( + "linear" if mode_s == "bilinear" else "cubic" if mode_s == "bicubic" else mode_s + ) + + +_onnx_symbolic = functools.partial(registration.onnx_symbolic, opset=20) + + +@_onnx_symbolic("aten::grid_sampler") +@symbolic_helper.parse_args("v", "v", "i", "i", "b") +def _grid_sampler( + g: jit_utils.GraphContext, + input: _C.Value, + grid: _C.Value, + mode_enum: int, + padding_mode_enum: int, + align_corners: bool, +): + mode_s = {v: k for k, v in F.GRID_SAMPLE_INTERPOLATION_MODES.items()}[mode_enum] # type: ignore[call-arg, index] + # mode string changes at https://onnx.ai/onnx/operators/text_diff_GridSample_16_20.html + mode_s = convert_grid_sample_mode(mode_s) + padding_mode_s = {v: k for k, v in F.GRID_SAMPLE_PADDING_MODES.items()}[ # type: ignore[call-arg, index] + padding_mode_enum # type: ignore[index] + ] + return g.op( + "GridSample", + input, + grid, + align_corners_i=int(align_corners), + mode_s=mode_s, + padding_mode_s=padding_mode_s, + ) + + +@_onnx_symbolic("aten::affine_grid_generator") +@symbolic_helper.parse_args("v", "v", "b") +def _affine_grid_generator( + g: jit_utils.GraphContext, + theta: _C.Value, + size: _C.Value, + align_corners: bool, +): + return g.op( + "AffineGrid", + theta, + size, + align_corners_i=int(align_corners), + ) + + +@_onnx_symbolic("aten::gelu") +@symbolic_helper.parse_args("v", "s") +def gelu(g: jit_utils.GraphContext, self: _C.Value, approximate: str = "none"): + return g.op("Gelu", self, approximate_s=approximate) diff --git a/.venv/Lib/site-packages/torch/onnx/symbolic_opset7.py b/.venv/Lib/site-packages/torch/onnx/symbolic_opset7.py new file mode 100644 index 0000000000000000000000000000000000000000..0044ce714d84275642a22afb8654d90ca3f01dfa --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/symbolic_opset7.py @@ -0,0 +1,67 @@ +# mypy: allow-untyped-defs +""" +Note [ONNX operators that are added/updated from opset 7 to opset 8] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +New operators: + Expand + +Updated operators: + Min, Max, Sum, Mean: supports multidirectional broadcasting. + MaxPool: added optional indices output. + Scan +""" + +import functools +import warnings + +from torch.onnx import symbolic_helper, symbolic_opset9 as opset9 +from torch.onnx._internal import jit_utils, registration + + +_onnx_symbolic = functools.partial(registration.onnx_symbolic, opset=7) + +block_listed_operators = ( + "scan", + "expand", + "expand_as", + "meshgrid", + "adaptive_max_pool1d", + "adaptive_max_pool2d", + "adaptive_max_pool3d", + "max_pool1d_with_indices", + "max_pool2d_with_indices", + "max_pool3d_with_indices", +) + + +# NOTE: max, min, sum, mean: broadcasting is not supported in opset 7. +# torch.max (same for torch.min) actually has two interfaces smashed together: +# torch.max(x, dim, keepdim) and torch.max(x, y) +@_onnx_symbolic("aten::max") +def max(g: jit_utils.GraphContext, self, dim_or_y=None, keepdim=None): + # torch.max(input, other) + if keepdim is None and dim_or_y is not None: + warnings.warn( + "Multidirectional broadcasting is not supported in opset 7. 
" + "This might cause the onnx model to be incorrect, if inputs to max operators " + "have different shapes" + ) + return opset9.max(g, self, dim_or_y, keepdim) + + +@_onnx_symbolic("aten::min") +def min(g: jit_utils.GraphContext, self, dim_or_y=None, keepdim=None): + # torch.min(input, other) + if keepdim is None and dim_or_y is not None: + warnings.warn( + "Multidirectional broadcasting is not supported in opset 7. " + "This might cause the onnx model to be incorrect, if inputs to min operators " + "have different shapes" + ) + return opset9.min(g, self, dim_or_y, keepdim) + + +for block_listed_op in block_listed_operators: + _onnx_symbolic(f"aten::{block_listed_op}")( + symbolic_helper._block_list_in_opset(block_listed_op) + ) diff --git a/.venv/Lib/site-packages/torch/onnx/symbolic_opset8.py b/.venv/Lib/site-packages/torch/onnx/symbolic_opset8.py new file mode 100644 index 0000000000000000000000000000000000000000..e490a55c21f57e6f05085044b89b6f1951b2407e --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/symbolic_opset8.py @@ -0,0 +1,463 @@ +# mypy: allow-untyped-defs +""" +Note [ONNX operators that are added/updated from opset 8 to opset 9] +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +New operators: + Compress + ConstantOfShape + EyeLike + MaxUnpool + OneHot + Sinh + Cosh + Asinh + Acosh + Atanh + Shrink + IsNaN + Sign + Erf + Scatter + Where + NonZero + TfIdfVectorizer + MeanVarianceNormalization + +Updated operators: + BatchNormalization: removed spatial attribute. + Greater, Less, Constant, MatMul, PRelu, Gemm, Flatten: more data types{integers} supported. + Cast: more data types{string} supported. + Upsample: moved scales from attribute to input. + Scan +""" + +import functools +import warnings + +import torch +from torch._C import _onnx as _C_onnx +from torch.onnx import _type_utils, errors, symbolic_helper, symbolic_opset9 as opset9 +from torch.onnx._internal import jit_utils, registration + + +_onnx_symbolic = functools.partial(registration.onnx_symbolic, opset=8) + +block_listed_operators = ( + "nonzero", + "where", + "scatter", + "scatter_add", + "erf", + "sign", + "isnan", + "gather", + "arange", + "masked_fill", + "index_fill", + "index_copy", + "repeat_interleave", + "any", + "all", +) + +for block_listed_op in block_listed_operators: + _onnx_symbolic(f"aten::{block_listed_op}")( + symbolic_helper._block_list_in_opset(block_listed_op) + ) + + +@_onnx_symbolic( + "aten::upsample_nearest1d", + decorate=[symbolic_helper._apply_params("upsample_nearest1d", 3, "nearest")], +) +@_onnx_symbolic( + "aten::upsample_nearest2d", + decorate=[symbolic_helper._apply_params("upsample_nearest2d", 4, "nearest")], +) +@_onnx_symbolic( + "aten::upsample_nearest3d", + decorate=[symbolic_helper._apply_params("upsample_nearest3d", 5, "nearest")], +) +@_onnx_symbolic( + "aten::upsample_linear1d", + decorate=[symbolic_helper._apply_params("upsample_linear1d", 3, "linear")], +) +@_onnx_symbolic( + "aten::upsample_bilinear2d", + decorate=[symbolic_helper._apply_params("upsample_bilinear2d", 4, "linear")], +) +@_onnx_symbolic( + "aten::upsample_trilinear3d", + decorate=[symbolic_helper._apply_params("upsample_trilinear3d", 5, "linear")], +) +def _interpolate(name, dim, interpolate_mode): + def symbolic_fn(g, input, output_size, *args): + scales, align_corners = symbolic_helper._get_interpolate_attributes( + g, interpolate_mode, args + ) + symbolic_helper._interpolate_warning(interpolate_mode) + align_corners = symbolic_helper._maybe_get_scalar(align_corners) + if 
align_corners: + return symbolic_helper._unimplemented(name, "align_corners == True", input) + output_size = symbolic_helper._maybe_get_const(output_size, "is") + if symbolic_helper._is_value(output_size): + return symbolic_helper._unimplemented( + name, "torch._C.Value (output_size) indexing" + ) + if scales is None: + scales = [ + 1.0 + if i < 2 + else float(output_size[-(dim - i)]) + / float(input.type().sizes()[-(dim - i)]) + for i in range(0, dim) + ] + return g.op("Upsample", input, mode_s=interpolate_mode, scales_f=scales) + + return symbolic_fn + + +@_onnx_symbolic("aten::__interpolate") +def __interpolate( + g: jit_utils.GraphContext, + input, + size, + scale_factor, + mode, + align_corners, + recompute_scale_factor, + antialias, +): + align_corners = symbolic_helper._maybe_get_const(align_corners, "b") + if not symbolic_helper._is_none(align_corners) and align_corners: + return symbolic_helper._unimplemented("interpolate", "align_corners == True") + + if not symbolic_helper._is_none(scale_factor) and symbolic_helper._is_value( + scale_factor + ): + return symbolic_helper._unimplemented( + "interpolate", "dynamic scales in opset 8" + ) + + if not symbolic_helper._is_none(size) and symbolic_helper._is_value(size): + return symbolic_helper._unimplemented("interpolate", "dynamic size in opset 8") + + scales, mode = symbolic_helper._interpolate_get_scales_and_mode( + g, input, size, scale_factor, mode, align_corners + ) + return g.op("Upsample", input, mode_s=mode, scales_f=scales) + + +# NOTE: We should create a wrapper for this kind of operation, after resolving the shape/type propagation +# issue for "cast" operators. Some symbolic functions depend on shape information of input tensor, which +# is lost after casting. +def _try_cast_integer_to_float(g: jit_utils.GraphContext, *args): + floating_scalar_types = { + _type_utils.JitScalarType.HALF, + _type_utils.JitScalarType.FLOAT, + _type_utils.JitScalarType.DOUBLE, + } + old_type = None + # Cast the input tensor to Float if its scalarType is known and is not floating number. + # If casting is performed, return the old scalarType, otherwise return None. + arg0_type = _type_utils.JitScalarType.from_value( + args[0], _type_utils.JitScalarType.UNDEFINED + ) + if arg0_type != _type_utils.JitScalarType.UNDEFINED: + old_type = arg0_type + if old_type not in floating_scalar_types: + old_type = old_type.scalar_name() # type: ignore[assignment] + args = tuple( + g.op("Cast", arg, to_i=_C_onnx.TensorProtoDataType.FLOAT) + for arg in args + ) + else: + return (None,) + args + else: + warnings.warn( + "Only floating datatype is supported for these operators: " + "{Greater, Less, MatMul, PRelu, Gemm, Flatten}. This might cause " + "the onnx model to be incorrect, if inputs have integer datatypes." + ) + return (old_type,) + args + + +def _cast_to_type(g: jit_utils.GraphContext, input, to_type): + if to_type is None: + return input + return getattr(opset9, f"_cast_{to_type}")(g, input, False) + + +def _comparison_operator(g: jit_utils.GraphContext, input, other, op_name): + other = symbolic_helper._maybe_get_scalar(other) + other = symbolic_helper._if_scalar_type_as(other, input) + _, input, other = _try_cast_integer_to_float(g, input, other) + return g.op(op_name, input, other) + + +# NOTE: For symbolics {gt, lt, bmm, matmul, prelu, mm, addmm, view, flatten}, +# integer input type not supported in opset8. Cast to float if possible. 
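+# For example (illustrative): exporting torch.mm on int32 inputs at opset 8 is +# emitted as Cast(to=FLOAT) -> Gemm -> Cast(back to int32) by the helpers below, +# which can change results for integers too large for float32 to hold exactly.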
+@_onnx_symbolic("aten::gt") +def gt(g: jit_utils.GraphContext, input, other): + return _comparison_operator(g, input, other, "Greater") + + +@_onnx_symbolic("aten::lt") +def lt(g: jit_utils.GraphContext, input, other): + return _comparison_operator(g, input, other, "Less") + + +@_onnx_symbolic("aten::bmm") +def bmm(g: jit_utils.GraphContext, self, other): + if symbolic_helper._try_get_scalar_type(self): + old_type, self, other = _try_cast_integer_to_float(g, self, other) + return _cast_to_type(g, g.op("MatMul", self, other), old_type) + else: + return g.op("MatMul", self, other) + + +@_onnx_symbolic("aten::matmul") +def matmul(g: jit_utils.GraphContext, self, other): + return bmm(g, self, other) + + +@_onnx_symbolic("aten::prelu") +def prelu(g: jit_utils.GraphContext, self, weight): + self_rank = symbolic_helper._get_tensor_rank(self) + weight_sizes = symbolic_helper._get_tensor_sizes(weight) + if self_rank is not None and self_rank > 2: + weight = g.op("Unsqueeze", weight, axes_i=list(range(1, self_rank - 1))) + elif self_rank == 0 and weight_sizes == [1]: + # self and weight are both scalar but weight has rank == 1, squeeze weight. + weight = symbolic_helper._squeeze_helper(g, weight, [0]) + if symbolic_helper._try_get_scalar_type(self): + old_type, self, weight = _try_cast_integer_to_float(g, self, weight) + return _cast_to_type(g, g.op("PRelu", self, weight), old_type) + else: + return g.op("PRelu", self, weight) + + +@_onnx_symbolic("aten::mm") +def mm(g: jit_utils.GraphContext, self, other): + # Create a dummy C tensor. Only needed for API purposes, the value is + # since beta = 0 + scalar_type = symbolic_helper._try_get_scalar_type(self, other) + if scalar_type is None: + raise errors.SymbolicValueError( + "mm can only operate on tensors with known types", self + ) + zero_constant = g.op( + "Constant", + value_t=torch.tensor([0], dtype=scalar_type.dtype()), + ) + + if symbolic_helper._try_get_scalar_type(self): + old_type, self, other, zero_constant = _try_cast_integer_to_float( + g, self, other, zero_constant + ) + return _cast_to_type( + g, + g.op("Gemm", self, other, zero_constant, beta_f=0.0, alpha_f=1.0), + old_type, + ) + return g.op("Gemm", self, other, zero_constant, beta_f=0.0, alpha_f=1.0) + + +@_onnx_symbolic("aten::addmm") +@symbolic_helper.parse_args("v", "v", "v", "t", "t") +def addmm(g: jit_utils.GraphContext, self, mat1, mat2, beta, alpha): + if symbolic_helper._try_get_scalar_type(self): + old_type, self, mat1, mat2 = _try_cast_integer_to_float(g, self, mat1, mat2) + return _cast_to_type( + g, + g.op( + "Gemm", + mat1, + mat2, + self, + beta_f=symbolic_helper._scalar(beta), + alpha_f=symbolic_helper._scalar(alpha), + ), + old_type, + ) + else: + return g.op( + "Gemm", + mat1, + mat2, + self, + beta_f=symbolic_helper._scalar(beta), + alpha_f=symbolic_helper._scalar(alpha), + ) + + +@_onnx_symbolic("aten::flatten") +def flatten(g: jit_utils.GraphContext, input, start_dim, end_dim): + start_dim_i = symbolic_helper._get_const(start_dim, "i", "start_dim") + end_dim_i = symbolic_helper._get_const(end_dim, "i", "end_dim") + + dim = input.type().dim() + if end_dim_i < 0: + end_dim_i = dim + end_dim_i + # use ONNX's Flatten operator for cases where the output shape is 2D + if start_dim_i == 1 and end_dim_i == dim - 1: + if symbolic_helper._try_get_scalar_type(input): + old_type, input = _try_cast_integer_to_float(g, input) + return _cast_to_type( + g, g.op("Flatten", input, axis_i=start_dim_i), old_type + ) + else: + return g.op("Flatten", input, axis_i=start_dim_i) + if 
start_dim_i == 0 and end_dim_i == dim - 2: + if symbolic_helper._try_get_scalar_type(input): + old_type, input = _try_cast_integer_to_float(g, input) + return _cast_to_type( + g, g.op("Flatten", input, axis_i=end_dim_i + 1), old_type + ) + else: + return g.op("Flatten", input, axis_i=end_dim_i + 1) + + return opset9.flatten(g, input, start_dim, end_dim) + + +def _constant_fill(g: jit_utils.GraphContext, sizes, dtype: int, const_value): + if dtype is None: + scalar_type = _type_utils.JitScalarType.FLOAT + else: + scalar_type = _type_utils.JitScalarType(dtype) + if not scalar_type.dtype().is_floating_point: + result = g.op( + "ConstantFill", + sizes, + dtype_i=_type_utils.JitScalarType.FLOAT.onnx_type(), + input_as_shape_i=1, + value_f=const_value, + ) + return g.op("Cast", result, to_i=scalar_type.onnx_type()) + else: + return g.op( + "ConstantFill", + sizes, + dtype_i=scalar_type.onnx_type(), + input_as_shape_i=1, + value_f=const_value, + ) + + +@_onnx_symbolic("aten::empty") +@symbolic_helper.parse_args("v", "i", "v", "v", "v", "v") +def empty( + g: jit_utils.GraphContext, + sizes, + dtype, + layout, + device, + pin_memory=False, + memory_format=None, +): + return zeros(g, sizes, dtype, layout, device, pin_memory) + + +@_onnx_symbolic("aten::empty_like") +@symbolic_helper.parse_args("v", "i", "v", "v", "v", "v") +def empty_like( + g: jit_utils.GraphContext, + input, + dtype, + layout, + device, + pin_memory=False, + memory_format=None, +): + return zeros_like(g, input, dtype, layout, device, pin_memory) + + +@_onnx_symbolic("aten::zeros") +@symbolic_helper.parse_args("v", "i", "v", "v", "v") +def zeros(g: jit_utils.GraphContext, sizes, dtype, layout, device, pin_memory=False): + # NOTE: no way to set device and layout in ONNX, so we ignore it + return _constant_fill(g, sizes, dtype, 0) + + +@_onnx_symbolic("aten::zeros_like") +@symbolic_helper.parse_args("v", "i", "v", "v", "v", "v") +def zeros_like( + g: jit_utils.GraphContext, + input, + dtype, + layout, + device, + pin_memory=False, + memory_format=None, +): + shape = g.op("Shape", input) + return _constant_fill(g, shape, dtype, 0) + + +@_onnx_symbolic("aten::ones") +@symbolic_helper.parse_args("v", "i", "v", "v", "v") +def ones(g: jit_utils.GraphContext, sizes, dtype, layout, device, pin_memory=False): + return _constant_fill(g, sizes, dtype, 1) + + +@_onnx_symbolic("aten::ones_like") +@symbolic_helper.parse_args("v", "i", "v", "v", "v", "v") +def ones_like( + g: jit_utils.GraphContext, + input, + dtype, + layout, + device, + pin_memory=False, + memory_format=None, +): + shape = g.op("Shape", input) + return _constant_fill(g, shape, dtype, 1) + + +@_onnx_symbolic("aten::full") +def full( + g: jit_utils.GraphContext, sizes, value, dtype, layout, device, pin_memory=False +): + const_value = symbolic_helper._maybe_get_const(value, "t") + if symbolic_helper._is_value(const_value): + tmp = zeros(g, sizes, dtype, layout, device) + return opset9.add(g, tmp, value, g.op("Constant", value_t=torch.tensor(1))) + else: + dtype = symbolic_helper._get_const(dtype, "i", "dtype") + return _constant_fill(g, sizes, dtype, const_value) + + +@_onnx_symbolic("aten::full_like") +@symbolic_helper.parse_args("v", "f", "i", "v", "v", "v", "v") +def full_like( + g: jit_utils.GraphContext, + input, + fill_value, + dtype, + layout, + device, + pin_memory=False, + memory_format=None, +): + shape = g.op("Shape", input) + return _constant_fill(g, shape, dtype, fill_value) + + +@_onnx_symbolic("aten::repeat") +def repeat(g: jit_utils.GraphContext, self, repeats): + 
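# Illustrative walk-through (assumed shapes): a complete tensor of sizes [2] + # repeated with repeats=[3, 1, 2] is first viewed as [1, 1, 2] so the ranks + # match, and Tile then produces shape [3, 1, 4]. +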
if not symbolic_helper._is_value(repeats): + repeats = g.op("Constant", value_t=torch.LongTensor(repeats)) + if symbolic_helper._is_packed_list(repeats): + repeat_size_len = len(symbolic_helper._unpack_list(repeats)) + else: + const_repeats = symbolic_helper._maybe_get_const(repeats, "is") + repeat_size_len = len(const_repeats) + if self.isCompleteTensor(): + sizes = self.type().sizes() + diff_dims = repeat_size_len - len(sizes) + if diff_dims > 0: + self = opset9.view( + g, self, g.op("Constant", value_t=torch.tensor([1] * diff_dims + sizes)) + ) + return g.op("Tile", self, repeats) diff --git a/.venv/Lib/site-packages/torch/onnx/symbolic_opset9.py b/.venv/Lib/site-packages/torch/onnx/symbolic_opset9.py new file mode 100644 index 0000000000000000000000000000000000000000..341071540c61292d813d1d3d0685ec4199311321 --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/symbolic_opset9.py @@ -0,0 +1,6637 @@ +# mypy: allow-untyped-decorators +# mypy: allow-untyped-defs +# mypy: disable-error-code=arg-type +"""This file exports ONNX ops for opset 9. + +Opset 9 is supported by ONNX release 1.4.1, +released on 01/23/19. +""" + +from __future__ import annotations + +import builtins +import functools +import math +import sys +import warnings +from typing import Callable, Sequence, TYPE_CHECKING + +import torch +import torch._C._onnx as _C_onnx +import torch.nn.modules.utils +import torch.onnx +from torch import _C + +# Monkey-patch graph manipulation methods on Graph, used for the ONNX symbolics +from torch.onnx import _constants, _deprecation, _type_utils, errors, symbolic_helper +from torch.onnx._globals import GLOBALS +from torch.onnx._internal import jit_utils, registration + + +if TYPE_CHECKING: + from torch.types import Number + +# EDITING THIS FILE? READ THIS FIRST!
+# see Note [Edit Symbolic Files] in README.md + +__all__ = [ + "abs", + "acos", + "add", + "addcmul", + "addmm", + "alias", + "amax", + "amin", + "aminmax", + "arange", + "argmax", + "argmin", + "as_strided", + "as_tensor", + "asin", + "atan", + "atan2", + "baddbmm", + "batch_norm", + "bernoulli", + "bitwise_not", + "bitwise_or", + "bmm", + "broadcast_tensors", + "broadcast_to", + "bucketize", + "cat", + "cdist", + "ceil", + "clamp_max", + "clamp_min", + "clamp", + "clone", + "constant_pad_nd", + "contiguous", + "conv_tbc", + "conv_transpose1d", + "conv_transpose2d", + "conv_transpose3d", + "conv1d", + "conv2d", + "conv3d", + "convert_element_type", + "convolution", + "cos", + "cosine_similarity", + "cross", + "cumsum", + "detach", + "dim", + "div", + "dot", + "dropout", + "elu", + "embedding_bag", + "embedding", + "empty_like", + "empty", + "eq", + "erf", + "exp", + "expand_as", + "expand", + "eye", + "fill", + "flatten", + "floor_divide", + "floor", + "floordiv", + "frobenius_norm", + "full_like", + "full", + "gather", + "ge", + "gelu", + "get_pool_ceil_padding", + "glu", + "group_norm", + "gt", + "hann_window", + "hardshrink", + "hardsigmoid", + "hardswish", + "hardtanh", + "index_add", + "index_copy", + "index_fill", + "index_put", + "index_select", + "index", + "instance_norm", + "is_floating_point", + "is_pinned", + "isnan", + "item", + "kl_div", + "layer_norm", + "le", + "leaky_relu", + "lerp", + "lift", + "linalg_cross", + "linalg_matrix_norm", + "linalg_norm", + "linalg_vector_norm", + "linear", + "linspace", + "log_sigmoid", + "log_softmax", + "log", + "log10", + "log1p", + "log2", + "logical_and", + "logical_not", + "logical_or", + "logical_xor", + "logit", + "logsumexp", + "lstm_cell", + "lstm", + "lt", + "masked_fill", + "masked_fill_", + "matmul", + "max_pool1d_with_indices", + "max_pool2d_with_indices", + "max_pool3d_with_indices", + "max", + "maximum", + "meshgrid", + "min", + "minimum", + "mish", + "mm", + "movedim", + "mse_loss", + "mul", + "multinomial", + "mv", + "narrow", + "native_layer_norm", + "ne", + "neg", + "new_empty", + "new_full", + "new_ones", + "new_zeros", + "nonzero_numpy", + "nonzero", + "norm", + "numel", + "numpy_T", + "one_hot", + "ones_like", + "ones", + "onnx_placeholder", + "pad", + "pairwise_distance", + "permute", + "pixel_shuffle", + "pixel_unshuffle", + "pow", + "prelu", + "prim_constant_chunk", + "prim_constant_split", + "prim_constant", + "prim_data", + "prim_device", + "prim_dtype", + "prim_if", + "prim_layout", + "prim_list_construct", + "prim_list_unpack", + "prim_loop", + "prim_max", + "prim_min", + "prim_shape", + "prim_tolist", + "prim_tuple_construct", + "prim_type", + "prim_unchecked_cast", + "prim_uninitialized", + "rand_like", + "rand", + "randint_like", + "randint", + "randn_like", + "randn", + "reciprocal", + "reflection_pad", + "relu", + "relu6", + "remainder", + "repeat_interleave", + "repeat", + "replication_pad", + "reshape_as", + "reshape", + "roll", + "rrelu", + "rsqrt", + "rsub", + "scalar_tensor", + "scatter_add", + "scatter", + "select", + "selu", + "sigmoid", + "sign", + "silu", + "sin", + "size", + "slice", + "softmax", + "softplus", + "softshrink", + "sort", + "split_with_sizes", + "split", + "sqrt", + "square", + "squeeze", + "stack", + "std_mean", + "std", + "sub", + "t", + "take", + "tan", + "tanh", + "tanhshrink", + "tensor", + "threshold", + "to", + "topk", + "transpose", + "true_divide", + "type_as", + "unbind", + "unfold", + "unsafe_chunk", + "unsafe_split_with_sizes", + "unsafe_split", + "unsqueeze", + 
"unsupported_complex_operators", + "noop_complex_operators", + "unused", + "var_mean", + "var", + "view_as", + "view", + "where", + "wrap_logical_op_with_cast_to", + "wrap_logical_op_with_negation", + "zeros_like", + "zeros", + "zero", +] + + +_onnx_symbolic = functools.partial(registration.onnx_symbolic, opset=9) + + +def _export(name: str): + """Exports the function in the current global namespace.""" + + def wrapper(func): + globals()[name] = func + __all__.append(name) + return func + + return wrapper + + +def unused(g): + """Represents "missing" optional inputs.""" + n = g.op("prim::Constant") + n.setType(_C.OptionalType.ofTensor()) + return n + + +@_onnx_symbolic("aten::_shape_as_tensor") +def _shape_as_tensor(g: jit_utils.GraphContext, input): + return g.op("Shape", input) + + +@_onnx_symbolic("aten::_reshape_from_tensor") +def _reshape_from_tensor(g: jit_utils.GraphContext, input, shape): + if isinstance(shape, list): + shape = g.op("Concat", *shape, axis_i=0) + return reshape(g, input, shape) + + +@_onnx_symbolic("aten::reshape") +@symbolic_helper.quantized_args(True) +def reshape(g: jit_utils.GraphContext, self, shape): + return symbolic_helper._reshape_helper(g, self, shape) + + +@_onnx_symbolic("aten::reshape_as") +@symbolic_helper.quantized_args(True) +def reshape_as(g: jit_utils.GraphContext, self, other): + shape = g.op("Shape", other) + return reshape(g, self, shape) + + +@_onnx_symbolic("aten::add") +def add(g: jit_utils.GraphContext, self, other, alpha=None): + """ + This function takes the add function and returns the corresponding ONNX operator. + + This function is not meant to be called directly by the user. + + Args: + g (GraphContext): The graph context. + self (Tensor): The first operand. + other (Tensor): The second operand. + alpha (float, optional): The scaling factor for the second operand. Defaults to None. + + Returns: + ONNX operator. + """ + if symbolic_helper._is_value(self) and symbolic_helper._is_tensor_list(self): + return symbolic_helper._onnx_opset_unsupported_detailed( + "Add", 9, 11, "Add between list of tensors not supported", self + ) + if alpha and symbolic_helper._scalar(symbolic_helper._maybe_get_scalar(alpha)) != 1: + other = g.op("Mul", other, alpha) + return g.op("Add", self, other) + + +@_onnx_symbolic("aten::sub") +def sub(g: jit_utils.GraphContext, self, other, alpha=None): + """ + Consumes sub function and returns the corresponding ONNX operator. + + This function is not meant to be called directly by the user. + + Args: + g (GraphContext): The graph context. + self (Tensor): The first operand. + other (Tensor): The second operand. + alpha (Optional[Tensor]): A scaling factor to apply to the second operand. + If `alpha` is not provided, it defaults to 1. + + Returns: + ONNX operator + """ + if alpha and symbolic_helper._scalar(symbolic_helper._maybe_get_scalar(alpha)) != 1: + other = g.op("Mul", other, alpha) + return g.op("Sub", self, other) + + +@_onnx_symbolic("aten::rsub") +def rsub(g: jit_utils.GraphContext, self, other, alpha=None): + return sub(g, other, self, alpha=alpha) + + +@_onnx_symbolic("aten::mul") +def mul(g: jit_utils.GraphContext, self, other): + if symbolic_helper._is_bool(self) and symbolic_helper._is_bool(other): + # ONNX Mul doesn't support Boolean, so use And as an equivalent operator. 
+ return g.op("And", self, other) + else: + return g.op("Mul", self, other) + + +@_onnx_symbolic("aten::div") +def div(g: jit_utils.GraphContext, self, other, *args): + if len(args) == 0: + return true_divide(g, self, other) + else: + return _div_rounding_mode(g, self, other, *args) + + +@_onnx_symbolic("aten::addcmul") +@symbolic_helper.parse_args("v", "v", "v", "f") +def addcmul(g: jit_utils.GraphContext, self, tensor1, tensor2, value=1.0): + value_tens = g.op("Constant", value_t=torch.tensor([value])) + return add(g, self, mul(g, mul(g, tensor1, tensor2), value_tens)) + + +@symbolic_helper.parse_args("v", "v", "s") +def _div_rounding_mode(g: jit_utils.GraphContext, self, other, rounding_mode): + if rounding_mode is None: + return true_divide(g, self, other) + elif rounding_mode == "floor": + return _floor_divide(g, self, other) + elif rounding_mode == "trunc": + return _trunc_divide(g, self, other) + else: + raise errors.SymbolicValueError( + f'Unsupported rounding mode: "{rounding_mode}". Expected None, "floor" or "trunc"', + self, + ) + + +def _trunc_divide(g: jit_utils.GraphContext, self, other): + out = g.op("Div", self, other) + # the correct operation is truncate, which is not supported in ONNX, + # we cannot call floor since it will behave differently for negative numbers + # (eg. -0.1 should become -0 ) + # - if scalar_type information are not available, assume that + # we need to call floor (treat as float) + out = g.op("Cast", out, to_i=_C_onnx.TensorProtoDataType.INT64) + + # Matching PyTorch's behavior: + # - if self is fp the output's type is self's type + # - if self is not fp and other is fp, the output is of type JitScalarType.FLOAT + # - self is not fp and other is not fp, the output's type is self's output type + # - the output type defaults to Float + scalar_type = _type_utils.JitScalarType.from_value( + self, _type_utils.JitScalarType.UNDEFINED + ) + if scalar_type != _type_utils.JitScalarType.UNDEFINED: + if not symbolic_helper._is_fp(self) and symbolic_helper._is_fp(other): + out = g.op("Cast", out, to_i=_C_onnx.TensorProtoDataType.FLOAT) + else: + out = g.op( + "Cast", + out, + to_i=scalar_type.onnx_type(), + ) + else: + out = g.op("Cast", out, to_i=_C_onnx.TensorProtoDataType.FLOAT) + return out + + +def _floor_divide(g: jit_utils.GraphContext, self, other): + if symbolic_helper._is_fp(self) or symbolic_helper._is_fp(other): + out = true_divide(g, self, other) + return g.op("Floor", out) + else: + # Integer division does trunction rounding + div = g.op("Div", self, other) + # Division is negative if: self < 0 != other < 0 + zero = g.op("Constant", value_t=torch.tensor(0, dtype=torch.int64)) + negative = g.op( + "Xor", + symbolic_helper._lt_helper(g, self, zero), + symbolic_helper._lt_helper(g, other, zero), + ) + + # For negative numbers with self % other != 0, subtract 1 to round down instead of up + mod = g.op("Sub", self, g.op("Mul", div, other)) + fixup_mask = g.op("And", negative, g.op("Not", g.op("Equal", mod, zero))) + + one = g.op("Constant", value_t=torch.tensor(1, dtype=torch.int64)) + fixup = g.op("Mul", fixup_mask, one) + return g.op("Sub", div, fixup) + + +@_onnx_symbolic("aten::floor_divide") +def floor_divide(g: jit_utils.GraphContext, self, other): + # Deprecated behavior, floor_divide actually truncates + return _trunc_divide(g, self, other) + + +@_onnx_symbolic("aten::floordiv") +def floordiv(g: jit_utils.GraphContext, self, other): + return floor_divide(g, self, other) + + +@_onnx_symbolic("aten::true_divide") +def true_divide(g: 
+@_onnx_symbolic("aten::true_divide") +def true_divide(g: jit_utils.GraphContext, self, other): + """Division where both inputs are cast to floating types + + If both inputs are floating, performs div as usual + If only one input is a floating type, the other input is cast to its type + If neither input is a floating type, both inputs are cast to the default scalar type + """ + + # Case 1: either value is floating + # Performs div as usual. + # Implicit casting will be handled in the scalar type analysis pass. + if symbolic_helper._is_fp(self) or symbolic_helper._is_fp(other): + return g.op("Div", self, other) + + # Case 2: neither is floating + # Casts both inputs to the default scalar type + scalar_type = torch.get_default_dtype() + onnx_scalar_type = _C_onnx.TensorProtoDataType.FLOAT + assert scalar_type is torch.float or scalar_type is torch.double + if torch.get_default_dtype() is torch.double: + onnx_scalar_type = _C_onnx.TensorProtoDataType.DOUBLE + + self = g.op("Cast", self, to_i=onnx_scalar_type) + other = g.op("Cast", other, to_i=onnx_scalar_type) + return g.op("Div", self, other) + + +@_onnx_symbolic("aten::reciprocal") +def reciprocal(g: jit_utils.GraphContext, self): + # torch.reciprocal implicitly casts to float, so we do the same. + if not symbolic_helper._is_fp(self): + self = g.op("Cast", self, to_i=_C_onnx.TensorProtoDataType.FLOAT) + return g.op("Reciprocal", self) + + +@_onnx_symbolic("aten::cat") +@symbolic_helper.parse_args("v", "i") +def cat(g: jit_utils.GraphContext, tensor_list, dim): + """Implement concatenation of pytorch tensors in ONNX along the specified `dim` dimension. + + Parameters: + g (jit_utils.GraphContext): Graph context. + tensor_list (List[torch.Tensor]): List of tensors to concatenate. + dim (int): Dimension along which to concatenate the tensors. + + Returns: + ONNX graph node representing the concatenated tensor. + """ + tensors = symbolic_helper._unpack_list(tensor_list) + # torch.cat ignores empty tensors such as `torch.Tensor([])` + # These need to be removed as inputs from ONNX's Concat too; otherwise shape inference + # will likely fail due to inputs with different ranks (0 for empty tensors, > 0 for anything else) + nonempty_tensors = [] + for t in tensors: + if symbolic_helper._is_constant(t) and not symbolic_helper._get_tensor_dim_size( + t, 0 + ): + continue + nonempty_tensors.append(t) + assert len(nonempty_tensors) > 0 + assert all( + symbolic_helper._get_tensor_rank(nonempty_tensors[0]) is None + or symbolic_helper._get_tensor_rank(t) is None + or symbolic_helper._get_tensor_rank(t) + == symbolic_helper._get_tensor_rank(nonempty_tensors[0]) + for t in nonempty_tensors + ) + tensor_list.node().removeAllInputs() + for t in nonempty_tensors: + tensor_list.node().addInput(t) + + tensors = symbolic_helper._unpack_list(tensor_list) + return g.op("Concat", *tensors, axis_i=dim) + + +@_onnx_symbolic("aten::stack") +@symbolic_helper.parse_args("v", "i") +def stack(g: jit_utils.GraphContext, tensor_list, dim): + unsqueezed = [ + symbolic_helper._unsqueeze_helper(g, t, [dim]) + for t in symbolic_helper._unpack_list(tensor_list) + ] + return g.op("Concat", *unsqueezed, axis_i=dim) + + +@_onnx_symbolic("aten::list") +def _list(g: jit_utils.GraphContext, self): + return self + +
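+# [Editor's note] An illustrative sketch, not part of the exporter: ``mm``
+# below lowers aten::mm to onnx::Gemm, and Gemm always takes a third input C,
+# computing alpha * A @ B + beta * C. With beta = 0 the C value never affects
+# the result, but the input slot must still be filled, hence the dummy
+# constant. The helper name is hypothetical.
+def _editor_example_gemm(A, B, C, alpha: float = 1.0, beta: float = 0.0):
+    """Reference semantics of onnx::Gemm for 2-D tensors (illustrative only)."""
+    return alpha * (A @ B) + beta * C
+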
+@_onnx_symbolic("aten::mm") +def mm(g: jit_utils.GraphContext, self, other): + # Create a dummy C tensor. Only needed for API purposes; the value is + # not used since beta = 0. + C = g.op("Constant", value_t=torch.tensor([1])) + return g.op("Gemm", self, other, C, beta_f=0.0, alpha_f=1.0) + + +@_onnx_symbolic("aten::bmm") +def bmm(g: jit_utils.GraphContext, self, other): + return g.op("MatMul", self, other) + + +@_onnx_symbolic("aten::matmul") +def matmul(g: jit_utils.GraphContext, self, other): + return g.op("MatMul", self, other) + + +@_onnx_symbolic("aten::addmm") +@symbolic_helper.parse_args("v", "v", "v", "t", "t") +def addmm(g: jit_utils.GraphContext, self, mat1, mat2, beta, alpha): + scalar_type = None + self_scalar_type = symbolic_helper._try_get_scalar_type(self) + mat1_scalar_type = symbolic_helper._try_get_scalar_type(mat1) + mat2_scalar_type = symbolic_helper._try_get_scalar_type(mat2) + if self_scalar_type is not None: + scalar_type = self_scalar_type + elif mat1_scalar_type is not None: + scalar_type = mat1_scalar_type + elif mat2_scalar_type is not None: + scalar_type = mat2_scalar_type + + mat1_rank = symbolic_helper._get_tensor_rank(mat1) + mat2_rank = symbolic_helper._get_tensor_rank(mat2) + + def is_not_none_nor(v, u): + return v is not None and v != u + + if scalar_type is not None and ( + is_not_none_nor(mat1_rank, 2) or is_not_none_nor(mat2_rank, 2) + ): + res1 = g.op("MatMul", mat1, mat2) + res2 = self + + alpha = symbolic_helper._scalar(alpha) + beta = symbolic_helper._scalar(beta) + + if alpha != 1: + alpha = g.op( + "Constant", value_t=torch.tensor(alpha, dtype=scalar_type.dtype()) + ) + res1 = g.op("Mul", res1, alpha) + if beta != 1: + beta = g.op( + "Constant", + value_t=torch.tensor( + symbolic_helper._scalar(beta), dtype=scalar_type.dtype() + ), + ) + res2 = g.op("Mul", res2, beta) + + return g.op("Add", res1, res2) + + return g.op( + "Gemm", + mat1, + mat2, + self, + beta_f=symbolic_helper._scalar(beta), + alpha_f=symbolic_helper._scalar(alpha), + ) + + +@_onnx_symbolic("aten::neg") +def neg(g: jit_utils.GraphContext, self): + return g.op("Neg", self) + + +@_onnx_symbolic("aten::sqrt") +def sqrt(g: jit_utils.GraphContext, self): + if _type_utils.JitScalarType.from_value( + self, _type_utils.JitScalarType.UNDEFINED + ) in { + _type_utils.JitScalarType.UINT8, + _type_utils.JitScalarType.INT8, + _type_utils.JitScalarType.INT16, + _type_utils.JitScalarType.INT, + _type_utils.JitScalarType.INT64, + }: + # torch converts all int inputs to sqrt to float + self = g.op("Cast", self, to_i=_C_onnx.TensorProtoDataType.FLOAT) + + return g.op("Sqrt", self) + + +@_onnx_symbolic("aten::rsqrt") +def rsqrt(g: jit_utils.GraphContext, self): + return g.op( + "Div", symbolic_helper._if_scalar_type_as(torch.ones(1), self), sqrt(g, self) + ) + + +@_onnx_symbolic("aten::tanh") +# Fixed scale and zero_point, discovered from aten/src/ATen/native/quantized/cpu/qtanh.cpp +@symbolic_helper.quantized_args(True, scale=2.0 / 256.0, zero_point=128) +def tanh(g: jit_utils.GraphContext, self): + return g.op("Tanh", self) + + +@_onnx_symbolic("aten::sin") +def sin(g: jit_utils.GraphContext, self): + return g.op("Sin", self) + + +@_onnx_symbolic("aten::cos") +def cos(g: jit_utils.GraphContext, self): + return g.op("Cos", self) + + +@_onnx_symbolic("aten::tan") +def tan(g: jit_utils.GraphContext, self): + return g.op("Tan", self) + + +@_onnx_symbolic("aten::asin") +def asin(g: jit_utils.GraphContext, self): + return g.op("Asin", self) + + +@_onnx_symbolic("aten::acos") +def acos(g: jit_utils.GraphContext, self): + return g.op("Acos", self) + +
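+# [Editor's note] An illustrative sketch, not part of the exporter: opset 9
+# has Atan but no Atan2, so the ``atan2`` symbolic below rebuilds the
+# full-quadrant result from atan(y/x), shifting by +/- pi when x < 0. This
+# mirrors the Where/Greater/Less graph below; like it, the sketch assumes
+# x != 0. The helper name is hypothetical.
+def _editor_example_atan2(y: float, x: float) -> float:
+    """Plain-Python model of the quadrant fixup in ``atan2`` (illustrative only)."""
+    raw = math.atan(y / x)  # lands in the wrong quadrant whenever x < 0
+    if x < 0:  # the Less(other, 0) branch below
+        return raw + math.pi if y > 0 else raw - math.pi
+    return raw
+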
+@_onnx_symbolic("aten::atan") +def atan(g: jit_utils.GraphContext, self): + return g.op("Atan", self) + + +@_onnx_symbolic("aten::atan2") +def atan2(g: jit_utils.GraphContext, self, other): + # self is y and other is x in the coordinate plane + slope = g.op("Div", self, other) + atan = g.op("Atan", slope) + const_zero = g.op("Constant", value_t=torch.tensor(0)) + const_pi = g.op("Constant", value_t=torch.tensor(math.pi)) + + condition_second_or_third_quadrant = g.op("Greater", self, const_zero) + second_third_quadrant = g.op( + "Where", + condition_second_or_third_quadrant, + g.op("Add", atan, const_pi), + g.op("Sub", atan, const_pi), + ) + + condition_14_or_23_quadrant = g.op("Less", other, const_zero) + result = g.op("Where", condition_14_or_23_quadrant, second_third_quadrant, atan) + + return result + + +@_onnx_symbolic("aten::sigmoid") +# Fixed scale and zero_point, discovered from aten/src/ATen/native/quantized/cpu/qsigmoid.cpp +@symbolic_helper.quantized_args(True, scale=1.0 / 256.0, zero_point=0) +def sigmoid(g: jit_utils.GraphContext, self): + """Converts the corresponding PyTorch function into ONNX operators. + + It is not meant to be called directly by a user. + + Args: + g (jit_utils.GraphContext): Graph context. + self (Tensor): the input tensor. + Returns: + ONNX operator + """ + return g.op("Sigmoid", self) + + +@_onnx_symbolic("aten::sign") +def sign(g: jit_utils.GraphContext, self): + return g.op("Sign", self) + + +@symbolic_helper.quantized_args(True) +def _slice(g: jit_utils.GraphContext, input, axes, starts, ends): + assert len(starts) == len(ends) + if len(starts) == 1 and starts[0] == 0 and ends[0] == _constants.INT64_MAX: + return input + return g.op("Slice", input, axes_i=axes, starts_i=starts, ends_i=ends) + + +@_onnx_symbolic( + "aten::sum", decorate=[symbolic_helper._apply_params("ReduceSum", "sum")] +) +@_onnx_symbolic( + "aten::mean", decorate=[symbolic_helper._apply_params("ReduceMean", "mean")] +) +# torch.prod does not support multidimensional "dim" +@_onnx_symbolic( + "aten::prod", + decorate=[ + symbolic_helper._apply_params( + "ReduceProd", "prod", allow_multi_dim_support=False + ) + ], +) +def _reduce_with_dtype(onnx_op: str, name: str, allow_multi_dim_support: bool = True): + return symbolic_helper._reduce_with_dtype_helper( + onnx_op, name, allow_multi_dim_support + ) + + +@_onnx_symbolic("aten::cumsum") +@symbolic_helper.parse_args("v", "i", "none") +def cumsum(g: jit_utils.GraphContext, input, dim, dtype): + symbolic_helper._onnx_opset_unsupported("cumsum", 9, 11, input) + + +@_onnx_symbolic("aten::_sample_dirichlet") +def _sample_dirichlet(g: jit_utils.GraphContext, self, generator): + return symbolic_helper._onnx_unsupported("_sample_dirichlet", self) + + +@_onnx_symbolic("aten::_standard_gamma") +def _standard_gamma(g: jit_utils.GraphContext, self, generator): + return symbolic_helper._onnx_unsupported("_standard_gamma", self) + +
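+# [Editor's note] An illustrative eager-mode check, not part of the exporter:
+# ``t`` below mirrors torch.t, which is the identity on 0-d and 1-d tensors
+# and swaps the two axes of a 2-d tensor; ONNX Transpose with perm=(1, 0)
+# only covers the 2-d case, hence the Identity fallback. The helper name is
+# hypothetical.
+def _editor_example_t_semantics() -> None:
+    """Eager behavior that the ``t`` symbolic below mirrors (illustrative only)."""
+    assert torch.t(torch.tensor([1.0, 2.0])).shape == (2,)  # rank 1: unchanged
+    assert torch.t(torch.ones(2, 3)).shape == (3, 2)  # rank 2: axes swapped
+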
+ return g.op("Identity", self) + return g.op("Transpose", self, perm_i=(1, 0)) + + +@_onnx_symbolic("aten::numpy_T") +@symbolic_helper.quantized_args(True) +def numpy_T(g: jit_utils.GraphContext, input): + ndim = symbolic_helper._get_tensor_rank(input) + assert ndim is not None + perm = list(reversed(range(0, ndim))) + return g.op("Transpose", input, perm_i=perm) + + +@_onnx_symbolic("aten::expand") +@symbolic_helper.quantized_args(True) +def expand(g: jit_utils.GraphContext, self, size, implicit): + """Implement the expand function for a pytorch tensor in ONNX according to specified `size`""" + size = symbolic_helper._maybe_get_const(size, "is") + if not symbolic_helper._is_value(size): + size = g.op("Constant", value_t=torch.LongTensor(size)) + elif symbolic_helper._is_packed_list(size): + # Expand with -1 dim value means dim is unchanged. + # Since onnx::expand supports two-way broadcasting, + # -1 dim value can be exported to onnx as 1 + size = symbolic_helper._reshape_helper( + g, stack(g, size, 0), g.op("Constant", value_t=torch.tensor([-1])) + ) + dtype = _type_utils.JitScalarType.INT64 + ones = ones_like(g, size, dtype) + neg_ones = mul(g, ones, g.op("Constant", value_t=torch.tensor(-1))) + size = where(g, g.op("Equal", size, neg_ones), ones, size) + return g.op("Expand", self, size) + + +@_onnx_symbolic("aten::broadcast_to") +@symbolic_helper.quantized_args(True) +def broadcast_to(g: jit_utils.GraphContext, self, size): + size = symbolic_helper._maybe_get_const(size, "is") + if not symbolic_helper._is_value(size): + size = g.op("Constant", value_t=torch.LongTensor(size)) + elif symbolic_helper._is_packed_list(size): + # Expand with -1 dim value means dim is unchanged. + # Since onnx::expand supports two-way broadcasting, + # -1 dim value can be exported to onnx as 1 + size = symbolic_helper._reshape_helper( + g, stack(g, size, 0), g.op("Constant", value_t=torch.tensor([-1])) + ) + dtype = _type_utils.JitScalarType.INT64 + ones = ones_like(g, size, dtype) + neg_ones = mul(g, ones, g.op("Constant", value_t=torch.tensor(-1))) + size = where(g, g.op("Equal", size, neg_ones), ones, size) + return g.op("Expand", self, size) + + +@_onnx_symbolic("aten::expand_as") +@symbolic_helper.quantized_args(True, True) +def expand_as(g: jit_utils.GraphContext, self, other): + self_t = symbolic_helper._maybe_get_const(self, "t") + if isinstance(self_t, torch.Tensor): + orig_type = self_t.dtype + self_t = self_t.to(torch.double) + dims = [] + for d in range(self_t.dim()): + if torch.equal(self_t.mean(d).unsqueeze(d).expand_as(self_t), self_t): + dims.append(d) + self = g.op( + "Constant", value_t=self_t.mean(dims, keepdim=True).to(orig_type) + ) + + shape = g.op("Shape", other) + return g.op("Expand", self, shape) + + +@_onnx_symbolic("aten::embedding") +@symbolic_helper.quantized_args(True) +@symbolic_helper.parse_args("v", "v", "i", "b", "v") +def embedding( + g: jit_utils.GraphContext, + weight, + indices, + padding_idx, + scale_grad_by_freq, + sparse, +): + if scale_grad_by_freq and GLOBALS.export_training: + raise errors.SymbolicValueError( + "Unsupported: ONNX export of embedding with scale_grad_by_freq=True " + "for training mode. ONNX does not support scaling the gradients.", + weight, + ) + if padding_idx >= 0 and GLOBALS.export_training: + warnings.warn( + "Warning: ONNX export of embedding with padding_idx >= 0 " + "for training mode. " + "ONNX does not support not updating the embedding vector at padding_idx during training." 
+ ) + + return g.op("Gather", weight, indices) + + +@_onnx_symbolic("aten::embedding_bag") +@symbolic_helper.quantized_args(True) +@symbolic_helper.parse_args("v", "v", "v", "i", "i", "i", "v", "i", "i") +def embedding_bag( + g: jit_utils.GraphContext, + embedding_matrix, + indices, + offsets, + scale_grad_by_freq, + mode, + sparse, + per_sample_weights, + include_last_offset, + padding_idx, +): + if not symbolic_helper._is_none(per_sample_weights): + return symbolic_helper._onnx_unsupported( + "embedding_bag with per_sample_weights" + ) + + return symbolic_helper._onnx_unsupported("embedding_bag", embedding_matrix) + + +@_onnx_symbolic("aten::size") +@symbolic_helper.quantized_args(True, quantize_output=False) +def size(g: jit_utils.GraphContext, self, dim=None): + if dim is None: + return g.op("Shape", self) + if symbolic_helper._maybe_get_const(dim, "i") < 0: + rank = symbolic_helper._get_tensor_rank(self) + if rank is not None: + dim = symbolic_helper._maybe_get_const(dim, "i") + rank + dim = g.op("Constant", value_t=torch.tensor(dim)) + return symbolic_helper._size_helper(g, self, dim) + + +@_onnx_symbolic("aten::transpose") +@symbolic_helper.quantized_args(True) +@symbolic_helper.parse_args("v", "i", "i") +def transpose(g: jit_utils.GraphContext, self, dim0, dim1): + if dim0 == dim1: # micro-optimization + return self + + # NB: Transpose in ONNX is actually a Permute + rank = symbolic_helper._get_tensor_rank(self) + if rank is not None: + axes = list(range(rank)) + axes[dim0], axes[dim1] = axes[dim1], axes[dim0] + return g.op("Transpose", self, perm_i=axes) + else: + raise errors.SymbolicValueError( + "Unsupported: ONNX export of transpose for tensor of unknown rank.", + self, + ) + + +@_onnx_symbolic("aten::permute") +@symbolic_helper.parse_args("v", "is") +def permute(g: jit_utils.GraphContext, self, dims): + if dims == list(range(0, len(dims))): + return self + return g.op("Transpose", self, perm_i=dims) + + +@_onnx_symbolic("aten::view") +@symbolic_helper.quantized_args(True) +def view(g: jit_utils.GraphContext, self, size): + return reshape(g, self, size) + + +@_onnx_symbolic("aten::view_as") +def view_as(g: jit_utils.GraphContext, self, other): + shape = g.op("Shape", other) + return reshape(g, self, shape) + + +@_onnx_symbolic("aten::unsafe_chunk") +@symbolic_helper.parse_args("v", "i", "i", "i") +def unsafe_chunk(g: jit_utils.GraphContext, self, chunks, dim, _outputs=None): + if _outputs is None: + return symbolic_helper._onnx_opset_unsupported_detailed( + "unsafe_chunk", 9, 11, "Dynamic number of outputs not supported", self + ) + size = symbolic_helper._get_tensor_dim_size(self, dim) + if size is None: + return symbolic_helper._unimplemented( + "unsafe_chunk", "unknown dimension size", self + ) + split_size = (size + chunks - 1) // chunks + splits = [split_size] * (size // split_size) + leftover = size % split_size + if leftover: + splits.append(leftover) + return g.op("Split", self, split_i=splits, axis_i=dim, outputs=_outputs) + + +@_onnx_symbolic("aten::split") +@symbolic_helper.parse_args("v", "v", "i", "i") +def split(g: jit_utils.GraphContext, self, split_size_or_sizes, dim, _outputs=None): + if not symbolic_helper._is_split_static(split_size_or_sizes, _outputs): + return symbolic_helper._onnx_opset_unsupported_detailed( + "split", 9, 11, "Dynamic number of outputs not supported", self + ) + split_val = symbolic_helper._node_get(split_size_or_sizes.node(), "value") + if split_val.dim() > 0: + return split_with_sizes(g, self, split_size_or_sizes, dim, _outputs) + 
split_size = symbolic_helper._get_const(split_size_or_sizes, "i", "split_size") + + size = symbolic_helper._get_tensor_dim_size(self, dim) + if size is None: + if _outputs is not None: + size = split_size * _outputs + else: + return symbolic_helper._onnx_opset_unsupported_detailed( + "split", 9, 11, "Unknown dimension size not supported", self + ) + splits = [split_size] * (size // split_size) + leftover = size % split_size + if leftover: + splits.append(leftover) + return g.op("Split", self, split_i=splits, axis_i=dim, outputs=_outputs) + + +@_onnx_symbolic("aten::unsafe_split") +def unsafe_split( + g: jit_utils.GraphContext, self, split_size_or_sizes, dim, _outputs=None +): + return split(g, self, split_size_or_sizes, dim, _outputs) + + +@_onnx_symbolic("aten::split_with_sizes") +@symbolic_helper.parse_args("v", "is", "i", "i") +def split_with_sizes(g: jit_utils.GraphContext, self, split_sizes, dim, _outputs=None): + if not symbolic_helper._is_split_static(split_sizes, _outputs): + return symbolic_helper._onnx_opset_unsupported_detailed( + "split_with_sizes", 9, 11, "Dynamic number of outputs not supported", self + ) + return g.op("Split", self, split_i=split_sizes, axis_i=dim, outputs=_outputs) + + +@_onnx_symbolic("aten::unsafe_split_with_sizes") +def unsafe_split_with_sizes( + g: jit_utils.GraphContext, self, split_sizes, dim, _outputs=None +): + return split_with_sizes(g, self, split_sizes, dim, _outputs) + + +@_onnx_symbolic("aten::unbind") +@symbolic_helper.parse_args("v", "i", "i") +def unbind(g: jit_utils.GraphContext, self, dim=0, _outputs=None): + if _outputs is None: + return symbolic_helper._onnx_opset_unsupported_detailed( + "unbind", 9, 11, "Dynamic number of outputs not supported", self + ) + + outputs = g.op("Split", self, split_i=[1] * _outputs, axis_i=dim, outputs=_outputs) + outputs = [outputs] if _outputs == 1 else outputs + squeezed_outputs = [ + symbolic_helper._squeeze_helper(g, out, [dim]) for out in outputs + ] + return squeezed_outputs + + +@_onnx_symbolic("aten::select") +@symbolic_helper.quantized_args(True) +@symbolic_helper.parse_args("v", "i", "v") +def select(g: jit_utils.GraphContext, self, dim, index): + """Implement the select functionality for a pytorch tensor in ONNX. + + Selects elements from the input tensor along the specified `dim` dimension based on the `index` tensor. + """ + index = symbolic_helper._maybe_get_scalar(index) + if (not symbolic_helper._is_value(index)) and (index < 0): + if index == -1: + end_index = _constants.INT64_MAX + else: + end_index = index + 1 + slice_node = symbolic_helper._slice_helper( + g, self, axes=[dim], starts=[index], ends=[end_index] + ) + return symbolic_helper._squeeze_helper(g, slice_node, [dim]) + else: + # FIXME(justinchuby): can index be an int and not a value? + return g.op("Gather", self, index, axis_i=dim) + + +@_onnx_symbolic("aten::square") +def square(g: jit_utils.GraphContext, self): + return g.op("Mul", self, self) + + +@_onnx_symbolic("aten::squeeze") +def squeeze(g: jit_utils.GraphContext, self, dim=None): + if dim is None: + return g.op("Squeeze", self) + + squeeze_dim = symbolic_helper._get_const(dim, "i", "dim") + # Handle negative dims + if squeeze_dim < 0: + rank = symbolic_helper._get_tensor_rank(self) + if rank is not None: + warnings.warn( + "ONNX export squeeze with negative axis " + + str(squeeze_dim) + + " might cause the onnx model to be incorrect. " + + "Negative axis is not supported in ONNX. 
" + + "Axis is converted to " + + str(squeeze_dim + rank) + + " based on input shape at export time. " + + "Passing an tensor of different rank in execution will be incorrect." + ) + squeeze_dim += rank + else: + return symbolic_helper._unimplemented( + "squeeze", "negative axis with unknown input rank", self + ) + + dim_size = symbolic_helper._get_tensor_dim_size(self, squeeze_dim) + if dim_size is None: + warnings.warn( + "This model contains a squeeze operation on dimension " + + str(squeeze_dim) + + " on an input " + + "with unknown shape. Note that if the size of dimension " + + str(squeeze_dim) + + " of the input " + + "is not 1, the ONNX model will return an error. Opset version 11 supports squeezing on " + + "non-singleton dimensions, it is recommended to export this model using opset " + + "version 11 or higher." + ) + return symbolic_helper._squeeze_helper(g, self, axes_i=[squeeze_dim]) + if dim_size > 1: + warnings.warn( + "This model contains a squeeze operation on dimension " + + str(squeeze_dim) + + ". The size of " + + "this dimension in the given input is " + + str(dim_size) + + ". The model will " + + "be exported without the squeeze node. If the model is intended to be used with dynamic " + + "input shapes, please use opset version 11 to " + + "export the model." + ) + return self + + warnings.warn( + "This model contains a squeeze operation on dimension " + + str(squeeze_dim) + + ". If the model is " + + "intended to be used with dynamic input shapes, please use opset version 11 to export the model." + ) + return symbolic_helper._squeeze_helper(g, self, axes_i=[squeeze_dim]) + + +@_onnx_symbolic("aten::prelu") +def prelu(g: jit_utils.GraphContext, self, weight): + self_rank = symbolic_helper._get_tensor_rank(self) + weight_sizes = symbolic_helper._get_tensor_sizes(weight) + weight_rank = len(weight_sizes) + if self_rank is not None: + if self_rank > 2: + # make weight unidirectional broadcastable + weight = symbolic_helper._unsqueeze_helper( + g, weight, list(range(1, self_rank - 1)) + ) + elif self_rank == 0 and weight_sizes == [1]: + # self and weight are both scalar but weight has rank == 1, squeeze weight. 
+@_onnx_symbolic("aten::prelu") +def prelu(g: jit_utils.GraphContext, self, weight): + self_rank = symbolic_helper._get_tensor_rank(self) + weight_sizes = symbolic_helper._get_tensor_sizes(weight) + weight_rank = len(weight_sizes) + if self_rank is not None: + if self_rank > 2: + # make weight unidirectionally broadcastable + weight = symbolic_helper._unsqueeze_helper( + g, weight, list(range(1, self_rank - 1)) + ) + elif self_rank == 0 and weight_sizes == [1]: + # self and weight are both scalar but weight has rank == 1; squeeze weight. + weight = symbolic_helper._squeeze_helper(g, weight, [0]) + weight_rank = 0 + + if self_rank is not None and weight_rank is not None: + assert ( + self_rank >= weight_rank + ), f"rank(x) should be >= rank(slope) but got {self_rank} < {weight_rank}" + return g.op("PRelu", self, weight) + + +@_onnx_symbolic("aten::silu") +def silu(g: jit_utils.GraphContext, input): + return g.op("Mul", input, g.op("Sigmoid", input)) + + +@_onnx_symbolic("aten::mish") +def mish(g: jit_utils.GraphContext, input): + return g.op("Mul", input, g.op("Tanh", g.op("Softplus", input))) + + +@_onnx_symbolic("aten::relu") +@symbolic_helper.quantized_args(True) +def relu(g: jit_utils.GraphContext, input): + return symbolic_helper._op_with_optional_float_cast( + g, "Relu", input, opset_before=14 + ) + + +@_onnx_symbolic("aten::relu6") +@symbolic_helper.quantized_args(True) +def relu6(g: jit_utils.GraphContext, input): + return clamp(g, input, 0, 6) + + +@_onnx_symbolic("aten::ceil") +def ceil(g: jit_utils.GraphContext, input): + return g.op("Ceil", input) + + +@_onnx_symbolic("aten::floor") +def floor(g: jit_utils.GraphContext, input): + return g.op("Floor", input) + + +@_onnx_symbolic("aten::len") +def _len(g: jit_utils.GraphContext, self): + sz_0 = size(g, self, g.op("Constant", value_t=torch.LongTensor([0]))) + return symbolic_helper._squeeze_helper(g, sz_0, [0]) + + +@_onnx_symbolic("aten::threshold") +@symbolic_helper.parse_args("v", "t", "t") +def threshold(g: jit_utils.GraphContext, self, threshold, value): + # See Note [Export inplace] + if symbolic_helper._scalar(threshold) != 0: + return symbolic_helper._unimplemented("threshold", "non-zero threshold", self) + if symbolic_helper._scalar(value) != 0: + return symbolic_helper._unimplemented("threshold", "non-zero value", self) + return g.op("Relu", self) + + +@_onnx_symbolic("aten::leaky_relu") +@symbolic_helper.quantized_args(True) +@symbolic_helper.parse_args("v", "f", "b") +def leaky_relu( + g: jit_utils.GraphContext, + input: _C.Value, + negative_slope: float, + inplace: bool = False, +): + # See Note [Export inplace] + return g.op("LeakyRelu", input, alpha_f=negative_slope) + + +@_onnx_symbolic("aten::glu") +@symbolic_helper.parse_args("v", "i") +def glu(g: jit_utils.GraphContext, input, dim): + dim_size = symbolic_helper._get_tensor_dim_size(input, dim) + if dim_size is not None: + assert dim_size % 2 == 0 + + first, second = g.op("Split", input, axis_i=dim, outputs=2) + return g.op("Mul", first, g.op("Sigmoid", second)) + + +@_onnx_symbolic("aten::softmax") +@symbolic_helper.parse_args("v", "i", "none") +def softmax(g: jit_utils.GraphContext, input, dim, dtype=None): + # Softmax does normalization at vector level. + # PyTorch and ONNX use different strategies to split the input tensor into vectors. + # Thus dim and axis have different meanings. + # PyTorch slices the input tensor into vectors along the `dim`-th dimension. + # ONNX reshapes the input into a 2-D tensor, and `axis` indicates where the input is coerced. + # If input is a 2 x 3 tensor: + # input = [[1.0, 1.0, 1.0], + # [1.0, 1.0, 1.0]] + # with dim = 0, the result is: + # result = [[0.5, 0.5, 0.5], + # [0.5, 0.5, 0.5]] + # with axis = 0, the result is: + # result = [[0.167, 0.167, 0.167], + # [0.167, 0.167, 0.167]] + # So their semantics are equivalent only when dim and axis both equal ndim - 1 (the last dimension). + # Hence we use Softmax directly when dim and axis both equal ndim - 1; + # otherwise we transpose the input to move the vectors to be normalized to the last dimension. 
+ # When input rank is not known at export time we compute softmax using a subgraph + # with other operators + input_dim = symbolic_helper._get_tensor_rank(input) + if input_dim is not None: + # TODO: remove this as onnx opset 11 spec allows negative axes + if dim < 0: + dim = input_dim + dim + + is_transpose_required = input_dim != dim + 1 + + if is_transpose_required: + axes = list(range(input_dim)) + axes[dim], axes[-1] = axes[-1], axes[dim] + input = g.op("Transpose", input, perm_i=axes) + dim = input_dim - 1 + + softmax = g.op("Softmax", input, axis_i=dim) + if dtype and dtype.node().kind() != "prim::Constant": + parsed_dtype = symbolic_helper._get_const(dtype, "i", "dtype") + softmax = g.op( + "Cast", + softmax, + to_i=_type_utils.JitScalarType(parsed_dtype).onnx_type(), + ) + + if is_transpose_required: + softmax = g.op("Transpose", softmax, perm_i=axes) # type: ignore[possibly-undefined] + return softmax + + # Apply max normalization. + input = g.op("Sub", input, g.op("ReduceMax", input, axes_i=[dim], keepdims_i=1)) + + exp = g.op("Exp", input) + sum = symbolic_helper._reducesum_helper(g, exp, axes_i=[dim]) + softmax = g.op("Div", exp, sum) + if dtype and dtype.node().kind() != "prim::Constant": + parsed_dtype = symbolic_helper._get_const(dtype, "i", "dtype") + softmax = g.op( + "Cast", softmax, to_i=_type_utils.JitScalarType(parsed_dtype).onnx_type() + ) + return softmax + + +@_onnx_symbolic("aten::softplus") +def softplus(g: jit_utils.GraphContext, self, beta, threshold): + beta_const = symbolic_helper._maybe_get_const(beta, "f") + if beta_const != 1: + return g.op("Div", g.op("Softplus", g.op("Mul", self, beta)), beta) + return g.op("Softplus", self) + + +@_onnx_symbolic("aten::get_pool_ceil_padding") +def get_pool_ceil_padding(input, kernel_size, stride, padding): + # TODO(justinchuby): Looks like this op is deprecated in torch + sizes = symbolic_helper._get_tensor_sizes(input) + dim = sizes[-len(padding) :] if sizes is not None else None + if dim is None or any(i is None for i in dim): + return symbolic_helper._unimplemented( + "get_pool_ceil_padding", "input size not accessible", input + ) + ceiled_output_dim = [ + int(math.ceil((dim[i] + 2 * padding[i] - kernel_size[i]) / float(stride[i]))) + + 1 + for i in range(0, len(padding)) + ] + # ensure last pooling starts inside + ceiled_output_dim = [ + ( + ceiled_output_dim[i] - 1 + if (((ceiled_output_dim[i] - 1) * stride[i]) >= (dim[i] + padding[i])) + else ceiled_output_dim[i] + ) + for i in range(0, len(ceiled_output_dim)) + ] + padding_ceil = [ + ( + 0 + if (stride[i] == 1) + else ( + kernel_size[i] + - ( + dim[i] + + 2 * padding[i] + - ((ceiled_output_dim[i] - 1) * stride[i] + 1) + ) + ) + ) + for i in range(0, len(padding)) + ] + # ensure padding is not > kernel_size + padding_ceil = [ + ( + ( + int(padding_ceil[i]) + if padding_ceil[i] < kernel_size[i] - 1 + else int(kernel_size[i] - 1) + ) + if ((padding_ceil[i] + 2 * padding[i]) >= (kernel_size[i])) + else int(padding_ceil[i]) + ) + for i in range(0, len(padding_ceil)) + ] + return padding_ceil + + +@_onnx_symbolic( + "aten::max_pool1d", + decorate=[ + symbolic_helper._apply_params( + "max_pool1d", torch.nn.modules.utils._single, 1, return_indices=False + ), + _export("max_pool1d"), + ], +) +@_onnx_symbolic( + "aten::max_pool2d", + decorate=[ + symbolic_helper._apply_params( + "max_pool2d", torch.nn.modules.utils._pair, 2, return_indices=False + ), + _export("max_pool2d"), + ], +) +@_onnx_symbolic( + "aten::max_pool3d", + decorate=[ + symbolic_helper._apply_params( + 
"max_pool3d", torch.nn.modules.utils._triple, 3, return_indices=False + ), + _export("max_pool3d"), + ], +) +def _max_pool(name, tuple_fn, ndims, return_indices): + @symbolic_helper.quantized_args(True, False, False, False, False, False) + @symbolic_helper.parse_args("v", "is", "is", "is", "is", "i") + def symbolic_fn(g, input, kernel_size, stride, padding, dilation, ceil_mode): + if set(tuple_fn(dilation)) != {1}: + return symbolic_helper._unimplemented(name, "dilation", input) + if not stride: + stride = kernel_size + padding = tuple(tuple_fn(padding)) + if ceil_mode: + padding_ceil = get_pool_ceil_padding(input, kernel_size, stride, padding) + padding = padding + tuple(a + b for (a, b) in zip(padding_ceil, padding)) + else: + padding = padding * 2 + kwargs = { + "kernel_shape_i": tuple_fn(kernel_size), + "pads_i": padding, + "strides_i": tuple_fn(stride), + } + # easy but hacky way to get flattened indices values + # to be used to convert the indices values to non-flattened. + # In ONNX the indices are computed as a flatten 1-D tensor, + # so the values in indices are in [0, N x C x D1 x ... x Dn). + # To convert the indices to the same format used by Pytorch, + # we first execute a maxpool with a kernel and stride of 1 on the same input. + # This will result in a tensor of indices in which each index will have it's own value. + # Using this tensor as a reference, we extract the first index of each axis and subtract + # it from each index of this axis in the indices to convert. + # This step will result in a tensor were each dimension has values of indices within + # the dimension it is in. + # For more information : + # https://github.com/pytorch/pytorch/pull/16455#issuecomment-460776407 + if return_indices: + r, indices = g.op("MaxPool", input, outputs=2, **kwargs) + _, flattened_indices = g.op( + "MaxPool", + input, + outputs=2, + kernel_shape_i=[1 for _ in range(ndims)], + strides_i=[1 for _ in range(ndims)], + ) + # convert indices to have non-flattened indices values + s = symbolic_helper._slice_helper( + g, + flattened_indices, + axes=[2 + i for i in range(ndims)], + starts=list(tuple_fn(0)), + ends=list(tuple_fn(1)), + ) + indices = sub(g, indices, s) + return r, indices + else: + r = g.op("MaxPool", input, outputs=1, **kwargs) + return r + + return symbolic_fn + + +max_pool1d_with_indices = _onnx_symbolic("aten::max_pool1d_with_indices")( + _max_pool( + "max_pool1d_with_indices", + torch.nn.modules.utils._single, + 1, + return_indices=True, + ) +) +max_pool2d_with_indices = _onnx_symbolic("aten::max_pool2d_with_indices")( + _max_pool( + "max_pool2d_with_indices", + torch.nn.modules.utils._pair, + 2, + return_indices=True, + ) +) +max_pool3d_with_indices = _onnx_symbolic("aten::max_pool3d_with_indices")( + _max_pool( + "max_pool3d_with_indices", + torch.nn.modules.utils._triple, + 3, + return_indices=True, + ) +) + + +@_onnx_symbolic( + "aten::avg_pool1d", + decorate=[ + symbolic_helper._apply_params("avg_pool1d", torch.nn.modules.utils._single), + _export("avg_pool1d"), + ], +) +@_onnx_symbolic( + "aten::avg_pool2d", + decorate=[ + symbolic_helper._apply_params("avg_pool2d", torch.nn.modules.utils._pair), + _export("avg_pool2d"), + ], +) +@_onnx_symbolic( + "aten::avg_pool3d", + decorate=[ + symbolic_helper._apply_params("avg_pool3d", torch.nn.modules.utils._triple), + _export("avg_pool3d"), + ], +) +def _avg_pool(name, tuple_fn): + @symbolic_helper.quantized_args(True) + @symbolic_helper.parse_args("v", "is", "is", "is", "i", "i", "none") + def symbolic_fn( + g, + input: 
+@_onnx_symbolic( + "aten::avg_pool1d", + decorate=[ + symbolic_helper._apply_params("avg_pool1d", torch.nn.modules.utils._single), + _export("avg_pool1d"), + ], +) +@_onnx_symbolic( + "aten::avg_pool2d", + decorate=[ + symbolic_helper._apply_params("avg_pool2d", torch.nn.modules.utils._pair), + _export("avg_pool2d"), + ], +) +@_onnx_symbolic( + "aten::avg_pool3d", + decorate=[ + symbolic_helper._apply_params("avg_pool3d", torch.nn.modules.utils._triple), + _export("avg_pool3d"), + ], +) +def _avg_pool(name, tuple_fn): + @symbolic_helper.quantized_args(True) + @symbolic_helper.parse_args("v", "is", "is", "is", "i", "i", "none") + def symbolic_fn( + g, + input: _C.Value, + kernel_size: Sequence[int], + stride: Sequence[int], + padding: int | Sequence[int], + ceil_mode: int, + count_include_pad: int, + divisor_override=None, + ): + if not stride: + stride = kernel_size + padding = symbolic_helper._avgpool_helper( + tuple_fn, padding, kernel_size, stride, divisor_override, name + ) + assert isinstance(padding, tuple) + adjusted_padding = padding + # Although onnx::AveragePool provides count_include_pad, + # the corner case of average pooling with ceil_mode in + # PyTorch allows the sliding window to go off-bounds, which leads to + # this accommodation. + # More detail at https://github.com/pytorch/pytorch/issues/57178 + if count_include_pad: + input = symbolic_helper._op_with_optional_float_cast( + g, + "Pad", + input, + pads_i=((0,) * 2 + padding) * 2, + mode_s="constant", + value_f=0.0, + opset_before=11, + ) + adjusted_padding = (0,) * len(padding) + if ceil_mode: + padding_ceil = get_pool_ceil_padding(input, kernel_size, stride, padding) + adjusted_padding = adjusted_padding + tuple( + a + b for (a, b) in zip(padding_ceil, adjusted_padding) + ) + else: + adjusted_padding = adjusted_padding * 2 + output = g.op( + "AveragePool", + input, + kernel_shape_i=tuple_fn(kernel_size), + strides_i=tuple_fn(stride), + pads_i=adjusted_padding, + ) + return output + + return symbolic_fn + +
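+# [Editor's note] An illustrative sketch, not part of the exporter: the
+# adaptive pooling symbolic below is exportable only when every spatial input
+# size is a multiple of the requested output size, in which case the adaptive
+# pool degenerates to a regular pool with kernel == stride == input / output.
+# The helper name is hypothetical.
+def _editor_example_adaptive_kernel(in_size: int, out_size: int) -> int:
+    """Kernel/stride derivation used by ``_adaptive_pool`` (illustrative only).
+
+    >>> _editor_example_adaptive_kernel(8, 4)
+    2
+    """
+    if in_size % out_size != 0:
+        raise ValueError("exportable only when out_size divides in_size")
+    return in_size // out_size
+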
+@_onnx_symbolic( + "aten::adaptive_avg_pool1d", + decorate=[ + symbolic_helper._apply_params( + "adaptive_avg_pool1d", "AveragePool", torch.nn.modules.utils._single + ), + _export("adaptive_avg_pool1d"), + ], +) +@_onnx_symbolic( + "aten::adaptive_avg_pool2d", + decorate=[ + symbolic_helper._apply_params( + "adaptive_avg_pool2d", "AveragePool", torch.nn.modules.utils._pair + ), + _export("adaptive_avg_pool2d"), + ], +) +@_onnx_symbolic( + "aten::adaptive_avg_pool3d", + decorate=[ + symbolic_helper._apply_params( + "adaptive_avg_pool3d", "AveragePool", torch.nn.modules.utils._triple + ), + _export("adaptive_avg_pool3d"), + ], +) +@_onnx_symbolic( + "aten::adaptive_max_pool1d", + decorate=[ + symbolic_helper._apply_params( + "adaptive_max_pool1d", + "MaxPool", + torch.nn.modules.utils._single, + max_pool1d_with_indices, + ), + _export("adaptive_max_pool1d"), + ], +) +@_onnx_symbolic( + "aten::adaptive_max_pool2d", + decorate=[ + symbolic_helper._apply_params( + "adaptive_max_pool2d", + "MaxPool", + torch.nn.modules.utils._pair, + max_pool2d_with_indices, + ), + _export("adaptive_max_pool2d"), + ], +) +@_onnx_symbolic( + "aten::adaptive_max_pool3d", + decorate=[ + symbolic_helper._apply_params( + "adaptive_max_pool3d", + "MaxPool", + torch.nn.modules.utils._triple, + max_pool3d_with_indices, + ), + _export("adaptive_max_pool3d"), + ], +) +def _adaptive_pool(name, type, tuple_fn, fn=None): + @symbolic_helper.quantized_args(True, False) + def symbolic_fn(g, input, output_size): + # _adaptive_pool is supported for cases where output_size is 1 for all dimensions, + # by executing a GlobalPool. + # It is also supported for cases where the output size is a factor of the input size. + # For these cases the stride and kernel size are uniform along all the indices of + # the same dimension, which makes it possible to export it to ONNX. + # For MaxPool, GlobalMaxPool does not return indices, + # so we try using max_poolxd_with_indices; if that is not possible + # (input is not a complete tensor or output size is not a factor of the input size) + # then we fall back to GlobalMaxPool and return None for the indices. + output_size_value = output_size + try: + output_size = symbolic_helper._parse_arg(output_size, "is") + except Exception: + # FIXME(justinchuby): Avoid catching Exception. + # Catch a more specific exception instead. + return symbolic_helper._onnx_unsupported( + "adaptive pooling, since output_size is not constant.", input + ) + if output_size == [1] * len(output_size) and type == "AveragePool": + return g.op("GlobalAveragePool", input) + sizes = symbolic_helper._get_tensor_sizes(input) + try: + dim = sizes[2:] + except Exception: + # FIXME(justinchuby): Avoid catching Exception. + # Catch a more specific exception instead. + dim = None + if dim is None or any(i is None for i in dim): + if output_size == [1] * len(output_size): + return g.op("GlobalMaxPool", input), None + return symbolic_helper._unimplemented( + name, "input size not accessible", input + ) + # verify that input size % output size == 0 for all dims + mod = [dim[i] % output_size[i] for i in range(0, len(dim))] + if mod != [0] * len(mod): + if output_size == [1] * len(output_size): + return g.op("GlobalMaxPool", input), None + return symbolic_helper._unimplemented( + name, "output size that is not a factor of input size", output_size_value + ) + k = [int(dim[i] / output_size[i]) for i in range(0, len(dim))] + # call max_poolxd_with_indices to get indices in the output + if type == "MaxPool": + return fn(g, input, k, k, (0,) * len(dim), (1,) * len(dim), False) + output = g.op(type, input, kernel_shape_i=tuple_fn(k), strides_i=tuple_fn(k)) + return output + + return symbolic_fn + + +def _prepare_onnx_paddings(dim: int, pad): + """Generate paddings in ONNX order based on pad in pytorch. + Args: + dim: the dimension of the tensor. + pad: the paddings in pytorch. + The order is dim_n_begin, dim_n_end, dim_n-1_begin, dim_n-1_end, ... + """ + # The desired order of paddings is + # dim_0_begin, dim_1_begin, ... , dim_0_end, ..., dim_n_end. + # n is the dimension of input. + # Assume zero-dimensions in the beginning. + paddings = list(pad[:]) + [0] * (dim * 2 - len(pad)) + # reverse order and collate first beginnings and then ends + paddings = paddings[-2::-2] + paddings[-1::-2] + return paddings + + +def _convert_padding_node(input): + padding = symbolic_helper._maybe_get_const(input, "is") + if symbolic_helper._is_value(padding) and symbolic_helper._is_packed_list(padding): + input_list = symbolic_helper._unpack_list(padding) + try: + padding = [ + symbolic_helper._get_const(v, "i", "padding") for v in input_list + ] + except Exception: + # FIXME(justinchuby): Avoid catching Exception. + # Catch a more specific exception instead. + return symbolic_helper._onnx_opset_unsupported_detailed( + "Pad", 9, 11, "The sizes of the padding must be constant", input + ) + return padding + + +@_onnx_symbolic("aten::constant_pad_nd") +def constant_pad_nd(g: jit_utils.GraphContext, input, padding, value): + mode = "constant" + try: + value = symbolic_helper._get_const(value, "f", "value") + except Exception: + # FIXME(justinchuby): Avoid catching Exception. + # Catch a more specific exception instead. 
+ return symbolic_helper._onnx_opset_unsupported_detailed( + "Pad", 9, 11, "The value for the padding must be constant", value + ) + + padding = _convert_padding_node(padding) + paddings = _prepare_onnx_paddings(symbolic_helper._get_tensor_rank(input), padding) + return symbolic_helper._op_with_optional_float_cast( + g, "Pad", input, pads_i=paddings, mode_s=mode, value_f=value, opset_before=11 + ) + + +def _pad_circular(g: jit_utils.GraphContext, input: _C.Value, pad: _C.Value): + padding = _convert_padding_node(pad) + assert len(padding) % 2 == 0 + ndim = len(padding) // 2 + + cur = input + for idx in range(ndim): + pad_r = padding[-(2 * idx + 1)] + pad_l = padding[-(2 * idx + 2)] + tensors = [] + if pad_l > 0: + left = symbolic_helper._slice_helper( + g, cur, axes=[2 + idx], starts=[-(pad_l)], ends=[_constants.INT64_MAX] + ) + tensors.append(left) + + if pad_l < 0 or pad_r < 0: + start = builtins.max(0, -pad_l) + end = -(builtins.max(0, -pad_r)) + middle = symbolic_helper._slice_helper( + g, + cur, + axes=[2 + idx], + starts=[start], + ends=[end], + ) + tensors.append(middle) + else: + tensors.append(cur) + + if pad_r > 0: + right = symbolic_helper._slice_helper( + g, cur, axes=[2 + idx], starts=[0], ends=[pad_r] + ) + tensors.append(right) + + cur = g.op("Concat", *tensors, axis_i=(2 + idx)) + + return cur + + +@_onnx_symbolic("aten::reflection_pad1d") +@_onnx_symbolic("aten::reflection_pad2d") +@_onnx_symbolic("aten::reflection_pad3d") +def reflection_pad(g: jit_utils.GraphContext, input, padding): + mode = "reflect" + padding = _convert_padding_node(padding) + paddings = _prepare_onnx_paddings(symbolic_helper._get_tensor_rank(input), padding) + return symbolic_helper._op_with_optional_float_cast( + g, "Pad", input, pads_i=paddings, mode_s=mode, opset_before=11 + ) + + +@_onnx_symbolic("aten::replication_pad1d") +@_onnx_symbolic("aten::replication_pad2d") +@_onnx_symbolic("aten::replication_pad3d") +def replication_pad(g: jit_utils.GraphContext, input, padding): + mode = "edge" + padding = _convert_padding_node(padding) + paddings = _prepare_onnx_paddings(symbolic_helper._get_tensor_rank(input), padding) + return symbolic_helper._op_with_optional_float_cast( + g, "Pad", input, pads_i=paddings, mode_s=mode, opset_before=11 + ) + + +@_onnx_symbolic("aten::pad") +def pad( + g: jit_utils.GraphContext, + input: _C.Value, + pad: _C.Value, + mode: _C.Value, + value: _C.Value, +): + mode = symbolic_helper._parse_arg(mode, "s") + if mode == "replicate": + return replication_pad(g, input, pad) + elif mode == "reflect": + return reflection_pad(g, input, pad) + elif mode == "constant": + return constant_pad_nd(g, input, pad, value) + elif mode == "circular": + return _pad_circular(g, input, pad) + else: + raise errors.SymbolicValueError(f"Unrecognized padding mode {mode}", input) + + +@_onnx_symbolic( + "aten::upsample_nearest1d", + decorate=[ + symbolic_helper._apply_params("upsample_nearest1d", 3, "nearest"), + _export("upsample_nearest1d"), + ], +) +@_onnx_symbolic( + "aten::upsample_nearest2d", + decorate=[ + symbolic_helper._apply_params("upsample_nearest2d", 4, "nearest"), + _export("upsample_nearest2d"), + ], +) +@_onnx_symbolic( + "aten::upsample_nearest3d", + decorate=[ + symbolic_helper._apply_params("upsample_nearest3d", 5, "nearest"), + _export("upsample_nearest3d"), + ], +) +@_onnx_symbolic( + "aten::upsample_linear1d", + decorate=[ + symbolic_helper._apply_params("upsample_linear1d", 3, "linear"), + _export("upsample_linear1d"), + ], +) +@_onnx_symbolic( + 
"aten::upsample_bilinear2d", + decorate=[ + symbolic_helper._apply_params("upsample_bilinear2d", 4, "linear"), + _export("upsample_bilinear2d"), + ], +) +@_onnx_symbolic( + "aten::upsample_trilinear3d", + decorate=[ + symbolic_helper._apply_params("upsample_trilinear3d", 5, "linear"), + _export("upsample_trilinear3d"), + ], +) +def _interpolate(name: str, dim: int, interpolate_mode: str): + def symbolic_fn(g, input, output_size, *args): + scales, align_corners = symbolic_helper._get_interpolate_attributes( + g, interpolate_mode, args + ) + symbolic_helper._interpolate_warning(interpolate_mode) + align_corners = symbolic_helper._maybe_get_scalar(align_corners) + if align_corners: + return symbolic_helper._unimplemented(name, "align_corners == True", input) + if scales is None: + scales = symbolic_helper._interpolate_size_to_scales( + g, input, output_size, dim + ) + return g.op("Upsample", input, scales, mode_s=interpolate_mode) + + return symbolic_fn + + +@_onnx_symbolic("aten::__interpolate") +def __interpolate( + g: jit_utils.GraphContext, + input, + size, + scale_factor, + mode, + align_corners, + recompute_scale_factor, + antialias, +): + scales, mode = symbolic_helper._interpolate_get_scales_and_mode( + g, input, size, scale_factor, mode, align_corners + ) + return g.op("Upsample", input, scales, mode_s=mode) + + +@_onnx_symbolic("aten::bitwise_not") +def bitwise_not(g: jit_utils.GraphContext, input): + if not symbolic_helper._is_bool(input): + raise errors.SymbolicValueError( + "ONNX export does NOT support exporting bitwise Not " + "for non-boolean input values", + input, + ) + return g.op("Not", input) + + +@_onnx_symbolic("aten::bitwise_or") +def bitwise_or(g, self, other): + if not symbolic_helper._is_bool(self): + raise errors.SymbolicValueError( + "ONNX export does NOT support exporting bitwise OR " + "for non-boolean input values. self: ", + self, + ) + if not symbolic_helper._is_bool(other): + raise errors.SymbolicValueError( + "ONNX export does NOT support exporting bitwise OR " + "for non-boolean input values. other: ", + other, + ) + return g.op("Or", self, other) + + +def wrap_logical_op_with_cast_to(to_type): + def decorator(fn): + @functools.wraps(fn) + def wrap_with_cast(g, input, other): + to_cast_func = globals()[f"_cast_{to_type}"] + return fn(g, to_cast_func(g, input, False), to_cast_func(g, other, False)) + + return wrap_with_cast + + return decorator + + +def wrap_logical_op_with_negation(func: Callable) -> Callable: + @functools.wraps(func) + def wrap_with_not(g, input, other): + return g.op("Not", func(g, input, other)) + + return wrap_with_not + + +@_onnx_symbolic("aten::__not_") +def __not_(g: jit_utils.GraphContext, self): + if not symbolic_helper._is_bool(self): + raise errors.SymbolicValueError( + "ONNX export does NOT support exporting bitwise Not " + "for non-boolean input values", + self, + ) + return g.op("Not", self) + + +@_onnx_symbolic("aten::eq") +@symbolic_helper.quantized_args(True, True) +def eq(g: jit_utils.GraphContext, self, other): + if isinstance(self.type(), _C.DeviceObjType) and isinstance( + other.type(), _C.DeviceObjType + ): + # ONNX doesn't have devices, so consider them all to be equal. + # The no-op check for equality will get constant-folded. 
+ return g.op("Constant", value_t=torch.tensor(True, dtype=torch.bool)) + self_node = self.node() + other_node = other.node() + if self_node.kind() == other_node.kind() == "onnx::Constant": + if self_node.kindOf("value") == other_node.kindOf("value") == "s": + # Exporting strings to ONNX is not supported. + # If both strings are constant, we can compare them directly. + # The no-op check for equality will get constant-folded. + return g.op( + "Constant", + value_t=torch.tensor( + self_node.s("value") == other_node.s("value"), + dtype=torch.bool, + ), + ) + + return g.op("Equal", self, other) + + +@_onnx_symbolic("aten::ne") +@symbolic_helper.quantized_args(True, True) +@wrap_logical_op_with_negation +def ne(g: jit_utils.GraphContext, self, other): + return eq(g, self, other) + + +@_onnx_symbolic("aten::gt") +@symbolic_helper.quantized_args(True, True) +def gt(g: jit_utils.GraphContext, input, other): + return _gt_impl(g, input, other) + + +def _gt_impl(g: jit_utils.GraphContext, input, other): + if symbolic_helper._is_bool(input) and symbolic_helper._is_bool(other): + input = g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.INT32) + other = g.op("Cast", other, to_i=_C_onnx.TensorProtoDataType.INT32) + return g.op("Greater", input, other) + + +@_onnx_symbolic("aten::lt") +@symbolic_helper.quantized_args(True, True) +def lt(g: jit_utils.GraphContext, input, other): + return _lt_impl(g, input, other) + + +def _lt_impl(g: jit_utils.GraphContext, input, other): + if symbolic_helper._is_bool(input) and symbolic_helper._is_bool(other): + input = g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.INT32) + other = g.op("Cast", other, to_i=_C_onnx.TensorProtoDataType.INT32) + return g.op("Less", input, other) + + +@_onnx_symbolic("aten::ge") +@symbolic_helper.quantized_args(True, True) +@wrap_logical_op_with_negation +def ge(g: jit_utils.GraphContext, input, other): + return _lt_impl(g, input, other) + + +@_onnx_symbolic("aten::le") +@symbolic_helper.quantized_args(True, True) +@wrap_logical_op_with_negation +def le(g: jit_utils.GraphContext, input, other): + return _gt_impl(g, input, other) + + +@_onnx_symbolic("aten::__and_") +def __and_(g: jit_utils.GraphContext, input, other): + if not symbolic_helper._is_bool(input): + raise errors.SymbolicValueError( + "ONNX export does NOT support exporting bitwise AND " + "for non-boolean input values", + input, + ) + if not symbolic_helper._is_bool(other): + raise errors.SymbolicValueError( + "ONNX export does NOT support exporting bitwise AND " + "for non-boolean input values", + other, + ) + return g.op("And", input, other) + + +@_onnx_symbolic("aten::__or_") +def __or_(g: jit_utils.GraphContext, input, other): + if not symbolic_helper._is_bool(input): + raise errors.SymbolicValueError( + "ONNX export does NOT support exporting bitwise OR " + "for non-boolean input values", + input, + ) + if not symbolic_helper._is_bool(other): + raise errors.SymbolicValueError( + "ONNX export does NOT support exporting bitwise OR " + "for non-boolean input values", + other, + ) + return g.op("Or", input, other) + + +@_onnx_symbolic("aten::__xor_") +def __xor_(g: jit_utils.GraphContext, input, other): + if not symbolic_helper._is_bool(input): + raise errors.SymbolicValueError( + "ONNX export does NOT support exporting bitwise XOR " + "for non-boolean input values", + input, + ) + if not symbolic_helper._is_bool(other): + raise errors.SymbolicValueError( + "ONNX export does NOT support exporting bitwise XOR " + "for non-boolean input values", + other, + ) + return 
g.op("Xor", input, other) + + +@_onnx_symbolic("aten::logical_and") +@wrap_logical_op_with_cast_to("Bool") +def logical_and(g: jit_utils.GraphContext, input, other): + return g.op("And", input, other) + + +@_onnx_symbolic("aten::logical_or") +@wrap_logical_op_with_cast_to("Bool") +def logical_or(g: jit_utils.GraphContext, input, other): + return g.op("Or", input, other) + + +@_onnx_symbolic("aten::logical_xor") +@wrap_logical_op_with_cast_to("Bool") +def logical_xor(g: jit_utils.GraphContext, input, other): + return g.op("Xor", input, other) + + +@_onnx_symbolic("aten::logical_not") +def logical_not(g: jit_utils.GraphContext, input): + return g.op("Not", g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.BOOL)) + + +@_onnx_symbolic("aten::__rshift_") +def __rshift_(g: jit_utils.GraphContext, self, other): + # make sure to cast other to self's type + # (when self is long, make sure that other is not float) + self_scalar_type = _type_utils.JitScalarType.from_value(self) + if ( + _type_utils.JitScalarType.from_value(other, _type_utils.JitScalarType.UNDEFINED) + != self_scalar_type + ): + other = g.op( + "Cast", + other, + to_i=self_scalar_type.onnx_type(), + ) + + two = g.op("Constant", value_t=torch.tensor(2, dtype=torch.float32)) + # exponent (same type as self) has to be float or double in onnx::Pow + if not symbolic_helper._is_fp(self): + other = g.op("Cast", other, to_i=_C_onnx.TensorProtoDataType.FLOAT) + two_pow = g.op("Pow", two, other) + two_pow = g.op( + "Cast", + two_pow, + to_i=self_scalar_type.onnx_type(), + ) + rshift = g.op("Div", self, two_pow) + return rshift + + +@_onnx_symbolic("aten::__lshift_") +def __lshift_(g: jit_utils.GraphContext, self, other): + # make sure to cast other to self's type + # (when self is long, make sure that other is not float) + self_scalar_type = _type_utils.JitScalarType.from_value(self) + if ( + _type_utils.JitScalarType.from_value(other, _type_utils.JitScalarType.UNDEFINED) + != self_scalar_type + ): + other = g.op( + "Cast", + other, + to_i=self_scalar_type.onnx_type(), + ) + + two = g.op("Constant", value_t=torch.tensor(2, dtype=torch.float32)) + # exponent (same type as self) has to be float or double in onnx::Pow + if not symbolic_helper._is_fp(self): + other = g.op("Cast", other, to_i=_C_onnx.TensorProtoDataType.FLOAT) + two_pow = g.op("Pow", two, other) + two_pow = g.op( + "Cast", + two_pow, + to_i=self_scalar_type.onnx_type(), + ) + lshift = g.op("Mul", self, two_pow) + return lshift + + +@_onnx_symbolic("aten::where") +@symbolic_helper.parse_args("v", "v", "v", "i") +def where(g: jit_utils.GraphContext, condition, self=None, other=None, _outputs=None): + # Assumes that torch.where's first argument takes only Bool and Byte tensors. + if not symbolic_helper._is_bool(condition): + condition = g.op("Cast", condition, to_i=_C_onnx.TensorProtoDataType.BOOL) + if self is None: + condition = nonzero(g, condition) + return symbolic_helper._unbind_helper( + g, condition, g.op("Constant", value_t=torch.tensor(1)), _outputs + ) + return g.op("Where", condition, self, other) + + +@_onnx_symbolic("aten::log_softmax") +@symbolic_helper.parse_args("v", "i", "none") +def log_softmax(g: jit_utils.GraphContext, input, dim, dtype=None): + # PyTorch dim and ONNX axis have different meanings. + # See Softmax comment for details. 
+ # TODO: remove this as onnx opset 11 spec allows negative axes + input_dim = symbolic_helper._get_tensor_rank(input) + if input_dim is None: + return symbolic_helper._unimplemented( + "dim", + "ONNX and PyTorch use different strategies to split the input. " + "Input rank must be known at export time.", + ) + if dim < 0: + dim = input_dim + dim + is_transpose_required = input_dim != dim + 1 + # ONNX only supports log_softmax with dim = -1. Transpose must be added before and after log_softmax to support other cases. + if is_transpose_required: + axes = list(range(input_dim)) + axes[dim], axes[-1] = axes[-1], axes[dim] + input = g.op("Transpose", input, perm_i=axes) + dim = input_dim - 1 + return_op = g.op("LogSoftmax", input, axis_i=dim) + if dtype and dtype.node().kind() != "prim::Constant": + parsed_dtype = symbolic_helper._get_const(dtype, "i", "dtype") + return_op = g.op( + "Cast", return_op, to_i=_type_utils.JitScalarType(parsed_dtype).onnx_type() + ) + if is_transpose_required: + return_op = g.op("Transpose", return_op, perm_i=axes) # type: ignore[possibly-undefined] + return return_op + + +@_onnx_symbolic("aten::_log_softmax") +@symbolic_helper.parse_args("v", "i", "i") +def _log_softmax(g: jit_utils.GraphContext, input, dim, half_to_float): + if ( + half_to_float + and _type_utils.JitScalarType.from_value( + input, _type_utils.JitScalarType.UNDEFINED + ) + == _type_utils.JitScalarType.HALF + ): + input = g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.FLOAT) + return log_softmax(g, input, dim) + + +@_onnx_symbolic("aten::_convolution") +@symbolic_helper.parse_args( + "v", "v", "v", "is", "is", "is", "i", "is", "i", "i", "i", "i", "i" +) +def _convolution( + g: jit_utils.GraphContext, + input, + weight, + bias, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + benchmark, + deterministic, + cudnn_enabled, + allow_tf32=None, +): + weight_size = symbolic_helper._get_tensor_sizes(weight) + try: + kernel_shape = weight_size[2:] + except Exception: + # FIXME(justinchuby): Avoid catching Exception. + # Catch a more specific exception instead. + kernel_shape = None + + if kernel_shape is None or any(i is None for i in kernel_shape): + raise errors.SymbolicValueError( + "Unsupported: ONNX export of convolution for kernel of unknown shape.", + input, + ) + + args = [input, weight] + # ONNX only supports 1D bias + if ( + not symbolic_helper._is_none(bias) + and symbolic_helper._get_tensor_rank(bias) == 1 + ): + args.append(bias) + + kwargs = { + "kernel_shape_i": weight_size[2:], + "strides_i": stride, + # NB: ONNX supports asymmetric padding, whereas PyTorch supports only + # symmetric padding + "pads_i": padding + padding, + "dilations_i": dilation, + "group_i": groups, + } + + if any(o != 0 for o in output_padding): + # ONNX supports both output_shape and output_padding. They are equally expressive. + # output_padding is more straightforward, so we use it here.
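+ # NOTE (illustrative worked check of the relationship below): with stride=2, + # input_shape=4, output_padding=1, kernel_shape=3, padding=1 the formula gives + # 2 * 3 + 1 + 3 - 2 = 8, matching the output length of a torch.nn.ConvTranspose1d + # with those hyperparameters applied to a length-4 input.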
+ # output_shape = stride * (input_shape - 1) + output_padding + kernel_shape - padding * 2 + assert transposed + assert len(stride) == len(output_padding) + kwargs["output_padding_i"] = output_padding + + n = g.op("ConvTranspose" if transposed else "Conv", *args, **kwargs) + + if ( + not symbolic_helper._is_none(bias) + and symbolic_helper._get_tensor_rank(bias) != 1 + ): + return g.op("Add", n, bias) + else: + return n + + +@_onnx_symbolic("aten::_convolution_mode") +@symbolic_helper.parse_args( + "v", + "v", + "v", + "is", + "s", + "is", + "i", +) +def _convolution_mode( + g: jit_utils.GraphContext, + input, + weight, + bias, + stride, + padding, + dilation, + groups, +): + weight_size = symbolic_helper._get_tensor_sizes(weight) + try: + kernel_shape = weight_size[2:] + except Exception: + # FIXME(justinchuby): Avoid catching Exception. + # Catch a more specific exception instead. + kernel_shape = None + + if kernel_shape is None or any(i is None for i in kernel_shape): + raise errors.SymbolicValueError( + "Unsupported: ONNX export of convolution for kernel of unknown shape.", + input, + ) + + args = [input, weight] + # ONNX only supports 1D bias + if ( + not symbolic_helper._is_none(bias) + and symbolic_helper._get_tensor_rank(bias) == 1 + ): + args.append(bias) + + if padding == "valid": + padding = "VALID" + elif padding == "same": + padding = "SAME_UPPER" + kwargs = { + "kernel_shape_i": weight_size[2:], + "strides_i": stride, + "auto_pad_s": padding, + "dilations_i": dilation, + "group_i": groups, + } + + n = g.op("Conv", *args, **kwargs) + + if ( + not symbolic_helper._is_none(bias) + and symbolic_helper._get_tensor_rank(bias) != 1 + ): + return g.op("Add", n, bias) + else: + return n + + +@_onnx_symbolic("aten::convolution") +@symbolic_helper.parse_args("v", "v", "v", "is", "is", "is", "i", "is", "i") +def convolution( + g: jit_utils.GraphContext, + input, + weight, + bias, + stride, + padding, + dilation, + transposed, + output_padding, + groups, +): + return _convolution( + g, + input, + weight, + bias, + stride, + padding, + dilation, + transposed, + output_padding, + groups, + None, + None, + None, + None, + ) + + +@_onnx_symbolic("aten::conv1d") +@symbolic_helper.parse_args("v", "v", "v", "is", "v", "is", "i") +def conv1d( + g: jit_utils.GraphContext, input, weight, bias, stride, padding, dilation, groups +): + str_padding = symbolic_helper._parse_arg(padding, "s") + if str_padding in ["valid", "same"]: + return _convolution_mode( + g, + input, + weight, + bias, + stride, + str_padding, + dilation, + groups, + ) + else: + padding = symbolic_helper._parse_arg(padding, "is") + return _convolution( + g, + input, + weight, + bias, + stride, + padding, + dilation, + False, + (), + groups, + None, + None, + None, + None, + ) + + +@_onnx_symbolic("aten::conv2d") +@symbolic_helper.parse_args("v", "v", "v", "is", "v", "is", "i") +def conv2d( + g: jit_utils.GraphContext, input, weight, bias, stride, padding, dilation, groups +): + str_padding = symbolic_helper._parse_arg(padding, "s") + if str_padding in ["valid", "same"]: + return _convolution_mode( + g, + input, + weight, + bias, + stride, + str_padding, + dilation, + groups, + ) + else: + padding = symbolic_helper._parse_arg(padding, "is") + return _convolution( + g, + input, + weight, + bias, + stride, + padding, + dilation, + False, + (), + groups, + None, + None, + None, + None, + ) + + +@_onnx_symbolic("aten::conv3d") +@symbolic_helper.parse_args("v", "v", "v", "is", "v", "is", "i") +def conv3d( + g: jit_utils.GraphContext, 
input, weight, bias, stride, padding, dilation, groups +): + str_padding = symbolic_helper._parse_arg(padding, "s") + if str_padding in ["valid", "same"]: + return _convolution_mode( + g, + input, + weight, + bias, + stride, + str_padding, + dilation, + groups, + ) + else: + padding = symbolic_helper._parse_arg(padding, "is") + return _convolution( + g, + input, + weight, + bias, + stride, + padding, + dilation, + False, + (), + groups, + None, + None, + None, + None, + ) + + +@_onnx_symbolic("aten::conv_transpose1d") +@symbolic_helper.parse_args("v", "v", "v", "is", "is", "is", "i", "is") +def conv_transpose1d( + g: jit_utils.GraphContext, + input, + weight, + bias, + stride, + padding, + output_padding, + groups, + dilation, +): + return _convolution( + g, + input, + weight, + bias, + stride, + padding, + dilation, + True, + output_padding, + groups, + None, + None, + None, + None, + ) + + +@_onnx_symbolic("aten::conv_transpose2d") +@symbolic_helper.parse_args("v", "v", "v", "is", "is", "is", "i", "is") +def conv_transpose2d( + g: jit_utils.GraphContext, + input, + weight, + bias, + stride, + padding, + output_padding, + groups, + dilation, +): + return _convolution( + g, + input, + weight, + bias, + stride, + padding, + dilation, + True, + output_padding, + groups, + None, + None, + None, + None, + ) + + +@_onnx_symbolic("aten::conv_transpose3d") +@symbolic_helper.parse_args("v", "v", "v", "is", "is", "is", "i", "is") +def conv_transpose3d( + g: jit_utils.GraphContext, + input, + weight, + bias, + stride, + padding, + output_padding, + groups, + dilation, +): + return _convolution( + g, + input, + weight, + bias, + stride, + padding, + dilation, + True, + output_padding, + groups, + None, + None, + None, + None, + ) + + +@_onnx_symbolic("aten::batch_norm") +@symbolic_helper.parse_args("v", "v", "v", "v", "v", "i", "f", "f", "i") +def batch_norm( + g: jit_utils.GraphContext, + input, + weight, + bias, + running_mean, + running_var, + training, + momentum, + eps, + cudnn_enabled, +): + symbolic_helper.check_training_mode(training, "batch_norm") + + if ( + torch.is_autocast_enabled() + and not symbolic_helper.args_have_same_dtype( + [input, weight, bias, running_mean, running_var] + ) + and GLOBALS.export_onnx_opset_version < 15 + ): + return symbolic_helper._onnx_opset_unsupported_detailed( + "BatchNormalization", + 9, + 15, + "All input tensors must have the same `dtype`." 
+ " Turn off Autocast or export using opset version 15.", + input, + ) + + weight, bias, running_mean, running_var = symbolic_helper._batchnorm_helper( + g, input, weight, bias, running_mean, running_var + ) + out = g.op( + "BatchNormalization", + input, + weight, + bias, + running_mean, + running_var, + epsilon_f=eps, + momentum_f=1 - momentum, + outputs=1 if not training else 5, + ) + if not training: + return out + else: + res, new_running_mean, new_running_var, saved_mean, saved_var = out + new_running_mean.setType(running_mean.type()) + new_running_var.setType(running_var.type()) + saved_mean.setDebugName("batch_norm_dead_output-" + saved_mean.debugName()) + saved_var.setDebugName("batch_norm_dead_output-" + saved_var.debugName()) + return res + + +@_onnx_symbolic("aten::native_layer_norm") +@symbolic_helper.quantized_args(True, False, False, False) +@symbolic_helper.parse_args("v", "is", "v", "v", "f") +def native_layer_norm( + g: jit_utils.GraphContext, + input: _C.Value, + normalized_shape: Sequence[int], + weight: _C.Value, + bias: _C.Value, + eps: float, +) -> tuple[_C.Value, _C.Value, _C.Value]: + axes = [-i for i in range(len(normalized_shape), 0, -1)] + + two_cst = symbolic_helper._generate_wrapped_number(g, 2.0) + eps_cst = symbolic_helper._generate_wrapped_number(g, eps) + + if g.opset < 18: + mean = g.op("ReduceMean", input, axes_i=axes) + else: + mean = g.op( + "ReduceMean", + input, + g.op("Constant", value_t=torch.tensor(axes, dtype=torch.long)), + ) + + numerator = sub(g, input, mean) + + # Cast it to eps dtype to avoid precision loss + is_type_half = ( + _type_utils.JitScalarType.from_value(numerator) + == _type_utils.JitScalarType.HALF + ) + if is_type_half: + eps_dtype = _type_utils.JitScalarType.from_value(eps_cst) + numerator = g.op( + "Cast", numerator, to_i=_type_utils.JitScalarType(eps_dtype).onnx_type() + ) + + # variance = e((x - e(x))^2), and (x - e(x)) is the numerator in the layer_norm formula + if g.opset < 18: + variance = g.op("ReduceMean", pow(g, numerator, two_cst), axes_i=axes) + else: + variance = g.op( + "ReduceMean", + pow(g, numerator, two_cst), + g.op("Constant", value_t=torch.tensor(axes, dtype=torch.long)), + ) + + denominator = sqrt(g, g.op("Add", variance, eps_cst)) + normalized = g.op("Div", numerator, denominator) + + # Cast back to input type as eps related ops are all done + if is_type_half: + input_dtype = _type_utils.JitScalarType.from_value(input) + normalized = g.op( + "Cast", normalized, to_i=_type_utils.JitScalarType(input_dtype).onnx_type() + ) + + if not (weight is None or symbolic_helper._is_none(weight)): + normalized = mul(g, normalized, weight) + if not (bias is None or symbolic_helper._is_none(bias)): + normalized = add(g, normalized, bias) + + # rdenominator := 1 / sqrt(variance + eps) + # According to aten::native_layer_norm, rdenominator should have the same dtype as input, + # mean and normalized, so we need to Cast it back + if is_type_half: + denominator = g.op( + "Cast", + denominator, + to_i=_type_utils.JitScalarType(input_dtype).onnx_type(), # type: ignore[possibly-undefined] + ) + rdenominator = g.op("Reciprocal", denominator) + else: + rdenominator = reciprocal(g, denominator) + + return normalized, mean, rdenominator + + +@_onnx_symbolic("aten::layer_norm") +@symbolic_helper.quantized_args(True, False, False, False) +@symbolic_helper.parse_args("v", "is", "v", "v", "f", "b") +def layer_norm( + g: jit_utils.GraphContext, + input: _C.Value, + normalized_shape: Sequence[int], + weight: _C.Value, + bias: _C.Value, + 
eps: float, + cudnn_enable: bool, +) -> _C.Value: + normalized, _, _ = native_layer_norm(g, input, normalized_shape, weight, bias, eps) + return normalized + + +@_onnx_symbolic("aten::instance_norm") +@symbolic_helper.parse_args("v", "v", "v", "v", "v", "b", "f", "f", "b") +def instance_norm( + g: jit_utils.GraphContext, + input, + weight, + bias, + running_mean, + running_var, + use_input_stats: bool, + momentum: Number, + eps: Number, + cudnn_enabled: bool, +): + symbolic_helper.check_training_mode(use_input_stats, "instance_norm") + channel_size = symbolic_helper._get_tensor_dim_size(input, 1) + if weight is None or symbolic_helper._is_none(weight): + if channel_size is None: + raise errors.SymbolicValueError( + "Unsupported: ONNX export of instance_norm for unknown channel size.", + input, + ) + weight_value = torch.tensor( + [1.0] * channel_size, + dtype=_type_utils.JitScalarType.from_value(input).dtype(), + ) + weight = g.op("Constant", value_t=weight_value) + if bias is None or symbolic_helper._is_none(bias): + if channel_size is None: + raise errors.SymbolicValueError( + "Unsupported: ONNX export of instance_norm for unknown channel size.", + input, + ) + bias_value = torch.tensor( + [0.0] * channel_size, + dtype=_type_utils.JitScalarType.from_value(input).dtype(), + ) + bias = g.op("Constant", value_t=bias_value) + if ( + running_mean is None + or symbolic_helper._is_none(running_mean) + or running_var is None + or symbolic_helper._is_none(running_var) + ): + return g.op("InstanceNormalization", input, weight, bias, epsilon_f=eps) + else: + input_size = symbolic_helper._get_tensor_sizes(input) + # If input shape is [N, C, H, W], reshape to [1, N * C, H, W] and call batch_norm. + # For more information, see instance_norm(): + # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/Normalization.cpp#L542 + input_size_reshape = input_size.copy() + n = input_size[0] + if n is None: + raise errors.SymbolicValueError( + "Unsupported: ONNX export of instance_norm training for unknown " + "batch size.", + input, + ) + c = input_size[1] + input_size_reshape[0] = 1 + input_size_reshape[1] = n * c + weight_ = repeat( + g, weight, g.op("Constant", value_t=torch.tensor([n], dtype=torch.int64)) + ) + bias_ = repeat( + g, bias, g.op("Constant", value_t=torch.tensor([n], dtype=torch.int64)) + ) + running_mean_ = repeat( + g, + running_mean, + g.op("Constant", value_t=torch.tensor([n], dtype=torch.int64)), + ) + running_var_ = repeat( + g, + running_var, + g.op("Constant", value_t=torch.tensor([n], dtype=torch.int64)), + ) + input_reshaped = g.op( + "Reshape", + input, + g.op("Constant", value_t=torch.LongTensor(input_size_reshape)), + ) + out = batch_norm( + g, + input_reshaped, + weight_, + bias_, + running_mean_, + running_var_, + use_input_stats, + momentum, + eps, + cudnn_enabled, + ) + return view(g, out, g.op("Constant", value_t=torch.tensor(input_size))) + + +@_onnx_symbolic("aten::unfold") +@symbolic_helper.parse_args("v", "i", "i", "i") +def unfold(g: jit_utils.GraphContext, input, dimension, size, step): + sizes = symbolic_helper._get_tensor_sizes(input) + # FIXME(justinchuby): Get rid of the try catch here to improve readability + try: + sizedim = sizes[dimension] + except Exception: + # FIXME(justinchuby): Avoid catching Exception. + # Catch a more specific exception instead.
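+ # NOTE (illustrative sketch): the slice-and-stack emulation below mirrors eager + # unfold behavior, assuming only torch: + # x = torch.arange(10.0) + # x.unfold(0, 3, 2) # rows are x[0:3], x[2:5], x[4:7], x[6:9]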
+ sizedim = None + if sizedim is not None: + low_indices = range(0, sizedim, step) + hi_indices = range(size, sizedim + 1, step) + stack = [ + symbolic_helper._slice_helper( + g, input, axes=[dimension], starts=[low], ends=[hi] + ) + for low, hi in zip(low_indices, hi_indices) + ] + ndim = len(sizes) + perm = list(range(0, ndim)) + perm.append(perm.pop(dimension)) + unsqueeze = [ + symbolic_helper._unsqueeze_helper( + g, g.op("Transpose", t, perm_i=perm), [dimension] + ) + for t in stack + ] + return g.op("Concat", *unsqueeze, axis_i=dimension) + else: + return symbolic_helper._unimplemented( + "Unfold", "input size not accessible", input + ) + + +@_onnx_symbolic("aten::elu") +@symbolic_helper.quantized_args(True) +@symbolic_helper.parse_args("v", "t", "t", "t") +def elu(g: jit_utils.GraphContext, input, alpha, scale, input_scale): + if scale and scale != 1.0: + return symbolic_helper._unimplemented( + "scale", "does not support scale in Elu", scale + ) + if input_scale and input_scale != 1.0: + return symbolic_helper._unimplemented( + "input_scale", "does not support input_scale in Elu", input_scale + ) + # See Note [Export inplace] + return g.op("Elu", input, alpha_f=symbolic_helper._scalar(alpha)) + + +@_onnx_symbolic("aten::selu") +@symbolic_helper.quantized_args(True) +def selu(g: jit_utils.GraphContext, input): + return g.op("Selu", input) + + +@_onnx_symbolic("aten::index_select") +@symbolic_helper.parse_args("v", "i", "v") +def index_select(g: jit_utils.GraphContext, self, dim, index): + # In case of a scalar index, index_select returns a tensor with the same rank as the input. + # To match this behavior in ONNX, we make index a 1D tensor so that the following gather + # also produces a tensor with the same rank as the input. + return symbolic_helper._select_helper(g, self, dim, index) + + +@_onnx_symbolic("aten::index_put") +def index_put(g: jit_utils.GraphContext, self, indices_list_value, values, accumulate): + if symbolic_helper._is_packed_list(indices_list_value): + indices_list = symbolic_helper._unpack_list(indices_list_value) + else: + indices_list = [indices_list_value] + + accumulate = symbolic_helper._parse_arg(accumulate, "b") + + if len(indices_list) == 0: + if accumulate: + return add(g, self, values) + return values + symbolic_helper._onnx_opset_unsupported("index_put", 9, 11, self) + + +@_onnx_symbolic("aten::index_fill") +def index_fill(g: jit_utils.GraphContext, self, dim, index, value): + dim_value = symbolic_helper._parse_arg(dim, "i") + expanded_index_shape, expanded_index = symbolic_helper._index_fill_reshape_helper( + g, self, dim, index + ) + value = symbolic_helper._maybe_get_scalar(value) + value = symbolic_helper._if_scalar_type_as(value, self) + expanded_value = expand(g, value, expanded_index_shape, None) + + return scatter(g, self, dim, expanded_index, expanded_value) + + +@_onnx_symbolic("aten::index_copy") +def index_copy(g: jit_utils.GraphContext, self, dim, index, source): + dim_value = symbolic_helper._parse_arg(dim, "i") + expanded_index_shape, expanded_index = symbolic_helper._index_fill_reshape_helper( + g, self, dim, index + ) + return scatter(g, self, dim, expanded_index, source) + + +@_onnx_symbolic("aten::bucketize") +@symbolic_helper.parse_args("v", "v", "b", "b") +def bucketize( + g: jit_utils.GraphContext, self, boundaries, out_int32=False, right=False +): + out_type = _C_onnx.TensorProtoDataType.INT64 + if out_int32: + out_type = _C_onnx.TensorProtoDataType.INT32 + # A tensor expanded_boundaries is created such that it + # contains a 
copy of boundaries for each element of self. + new_shape = g.op("Concat", g.op("Shape", boundaries), g.op("Shape", self), axis_i=0) + # Unsqueeze step is performed to respect ONNX's numpy style broadcasting for comparison ops + # https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md + tensor_rank = symbolic_helper._get_tensor_rank(self) + assert tensor_rank is not None + unsqueeze_axes = list(range(1, tensor_rank + 1)) + expanded_boundaries = expand( + g, + symbolic_helper._unsqueeze_helper(g, boundaries, unsqueeze_axes), + new_shape, + None, + ) + # Compare each element of self to boundaries to get a tensor + # with leading 1s and trailing 0s. + # e.g., 4 > [1, 3, 4] = [1, 1, 0] + # The index of the last 1 is the bucket where the element should go. + if right: + cond = ge(g, self, expanded_boundaries) + else: + cond = gt(g, self, expanded_boundaries) + cond_out = g.op("Cast", cond, to_i=out_type) + # Sum to get the number of 1s corresponding to each element, + # which is the same as the bucket index. + # e.g., sum(4 > [1, 3, 4]) = sum([1, 1, 0]) = 2 + return symbolic_helper._reducesum_helper(g, cond_out, axes_i=[0], keepdims_i=0) + + +@_onnx_symbolic("aten::type_as") +def type_as(g: jit_utils.GraphContext, self, other): + self_dtype = symbolic_helper._try_get_scalar_type(self) + other_dtype = symbolic_helper._try_get_scalar_type(other) + if self_dtype == other_dtype and self_dtype is not None: + return self + if other_dtype is not None: + return g.op( + "Cast", + self, + to_i=other_dtype.onnx_type(), + ) + + raise errors.SymbolicValueError( + "Unsupported: ONNX export of type_as for tensor " + "of unknown dtype. Please check if the dtype of the " + "parameter passed to the type_as function is correct.", + other, + ) + + +@_onnx_symbolic("aten::cosine_similarity") +@symbolic_helper.parse_args("v", "v", "i", "f") +def cosine_similarity(g: jit_utils.GraphContext, x1, x2, dim, eps): + cross = symbolic_helper._reducesum_helper( + g, mul(g, x1, x2), axes_i=[dim], keepdims_i=0 + ) + x1_l2 = symbolic_helper._reducesum_helper( + g, mul(g, x1, x1), axes_i=[dim], keepdims_i=0 + ) + x2_l2 = symbolic_helper._reducesum_helper( + g, mul(g, x2, x2), axes_i=[dim], keepdims_i=0 + ) + div_tens = max( + g, sqrt(g, mul(g, x1_l2, x2_l2)), g.op("Constant", value_t=torch.tensor([eps])) + ) + return div(g, cross, div_tens) + + +@_onnx_symbolic("aten::pairwise_distance") +def pairwise_distance(g: jit_utils.GraphContext, input1, input2, p, eps, keepdim): + if not symbolic_helper._is_value(eps): + eps = g.op("Constant", value_t=torch.tensor([eps])) + inv_p = div( + g, + g.op("Constant", value_t=torch.tensor([1], dtype=torch.float)), + add(g, p, eps), + ) + summation = symbolic_helper._reducesum_helper( + g, + pow(g, sub(g, input1, input2), p), + axes_i=[-1], + keepdims_i=symbolic_helper._parse_arg(keepdim, "i"), + ) + return pow(g, summation, inv_p) + + +@_onnx_symbolic("aten::clone") +# ignore clone operators that are inserted by PyTorch autograd +def clone(g: jit_utils.GraphContext, input, unused_memory_format): + return input + + +@_onnx_symbolic("aten::abs") +def abs(g: jit_utils.GraphContext, self): + return g.op("Abs", self) + + +@_onnx_symbolic("aten::log") +def log(g: jit_utils.GraphContext, self): + return g.op("Log", self) + + +@_onnx_symbolic("aten::log1p") +def log1p(g: jit_utils.GraphContext, self): + return log(g, add(g, symbolic_helper._if_scalar_type_as(torch.ones(1), self), self)) + + +@_onnx_symbolic("aten::log10") +def log10(g: jit_utils.GraphContext, self): + _ln10 = 2.30258509299404568401 
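+ # NOTE (illustrative): log10(x) = ln(x) / ln(10); _ln10 above is ln(10) to 20 + # decimal places. Quick eager check, assuming only torch: + # torch.log(torch.tensor(100.0)) / 2.302585092994046 # ~= 2.0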
+ return g.op("Div", log(g, self), g.op("Constant", value_t=torch.tensor([_ln10]))) + + +@_onnx_symbolic("aten::pow") +def pow(g: jit_utils.GraphContext, self, exponent): + f_dtype = _type_utils.JitScalarType.from_value(self) + if not symbolic_helper._is_fp(self): + f_dtype = _type_utils.JitScalarType.FLOAT + self = g.op("Cast", self, to_i=f_dtype.onnx_type()) + if not symbolic_helper._is_fp(exponent): + exponent = g.op( + "Cast", + exponent, + to_i=f_dtype.onnx_type(), + ) + pow = g.op("Pow", self, exponent) + return pow + + +@_onnx_symbolic("aten::clamp") +def clamp(g: jit_utils.GraphContext, self, min, max): + # min or max may be None, in which case we need to dispatch to + # Clip separately, as ONNX does not have None syntax + if symbolic_helper._is_none(min): + return clamp_max(g, self, max) + elif symbolic_helper._is_none(max): + return clamp_min(g, self, min) + else: + if symbolic_helper._is_constant(min) and symbolic_helper._is_constant(max): + return symbolic_helper._op_with_optional_float_cast( + g, + "Clip", + self, + min_f=symbolic_helper._parse_arg(min, "f"), + max_f=symbolic_helper._parse_arg(max, "f"), + opset_before=12, + ) + else: + return clamp_max(g, clamp_min(g, self, min), max) + + +@_onnx_symbolic("aten::clamp_min") +@symbolic_helper.parse_args("v", "v") +def clamp_min(g: jit_utils.GraphContext, self, min): + if symbolic_helper._is_constant(min): + return symbolic_helper._op_with_optional_float_cast( + g, "Clip", self, min_f=symbolic_helper._parse_arg(min, "f"), opset_before=12 + ) + else: + dtype = _type_utils.JitScalarType.from_value(self) + min = g.op("Cast", min, to_i=dtype.onnx_type()) + return symbolic_helper._op_with_optional_float_cast( + g, "Max", self, min, opset_before=12 + ) + + +@_onnx_symbolic("aten::clamp_max") +@symbolic_helper.parse_args("v", "v") +def clamp_max(g: jit_utils.GraphContext, self, max): + if symbolic_helper._is_constant(max): + return symbolic_helper._op_with_optional_float_cast( + g, "Clip", self, max_f=symbolic_helper._parse_arg(max, "f"), opset_before=12 + ) + else: + dtype = _type_utils.JitScalarType.from_value(self) + max = g.op("Cast", max, to_i=dtype.onnx_type()) + return symbolic_helper._op_with_optional_float_cast( + g, "Min", self, max, opset_before=12 + ) + + +@_onnx_symbolic("aten::max") +# torch.max (same for torch.min) actually has two interfaces smashed together: +# torch.max(x, dim, keepdim) and torch.max(x, y) +# TODO(justinchuby): Support multiple quantized args in output +def max(g: jit_utils.GraphContext, self, dim_or_y=None, keepdim=None): + return symbolic_helper._max_helper(g, self, dim_or_y, keepdim) + + +@_onnx_symbolic("aten::maximum") +@symbolic_helper.quantized_args(True, True) +def maximum(g: jit_utils.GraphContext, input, other): + return max(g, input, dim_or_y=other) + + +@_onnx_symbolic("aten::min") +# TODO(justinchuby): Support multiple quantized args in output +def min(g: jit_utils.GraphContext, self, dim_or_y=None, keepdim=None): + return symbolic_helper._min_helper(g, self, dim_or_y, keepdim) + + +@_onnx_symbolic("aten::minimum") +@symbolic_helper.quantized_args(True, True) +def minimum(g: jit_utils.GraphContext, input, other): + return min(g, input, dim_or_y=other) + + +@_onnx_symbolic("aten::amax") +@symbolic_helper.quantized_args(True) +@symbolic_helper.parse_args("v", "is", "i") +def amax(g: jit_utils.GraphContext, self, dim, keepdim): + return g.op("ReduceMax", self, axes_i=dim, keepdims_i=keepdim) + + +@_onnx_symbolic("aten::amin") +@symbolic_helper.quantized_args(True) +@symbolic_helper.parse_args("v",
"is", "i") +def amin(g: jit_utils.GraphContext, self, dim, keepdim): + return g.op("ReduceMin", self, axes_i=dim, keepdims_i=keepdim) + + +@_onnx_symbolic("aten::aminmax") +@symbolic_helper.quantized_args(True) +@symbolic_helper.parse_args("v", "v", "i") +def aminmax(g: jit_utils.GraphContext, self, dim, keepdim): + reduce_kwargs = {"keepdims_i": keepdim} + if not symbolic_helper._is_none(dim): + dim = symbolic_helper._get_const(dim, "i", "dim") + reduce_kwargs["axes_i"] = [dim] + + return g.op("ReduceMin", self, **reduce_kwargs), g.op( + "ReduceMax", self, **reduce_kwargs + ) + + +@_onnx_symbolic("aten::exp") +def exp(g: jit_utils.GraphContext, self): + return g.op("Exp", self) + + +@_onnx_symbolic("aten::dropout_") +@_onnx_symbolic("aten::dropout") +@symbolic_helper.parse_args("v", "f", "i") +def dropout(g: jit_utils.GraphContext, input, p, train): + symbolic_helper.check_training_mode(train, "dropout") + # if train is False, dropout is no-op + if not train: + return input + r, _ = g.op("Dropout", input, ratio_f=p, outputs=2) + return r + + +@_onnx_symbolic( + "aten::alpha_dropout_", + decorate=[symbolic_helper._apply_params("aten::alpha_dropout_")], +) # See Note [Export inplace] +@_onnx_symbolic( + "aten::feature_alpha_dropout_", + decorate=[symbolic_helper._apply_params("aten::feature_alpha_dropout_")], +) +@_onnx_symbolic( + "aten::feature_dropout_", + decorate=[symbolic_helper._apply_params("aten::feature_dropout_")], +) +@_onnx_symbolic( + "aten::feature_alpha_dropout", + decorate=[symbolic_helper._apply_params("aten::feature_alpha_dropout")], +) +@_onnx_symbolic( + "aten::alpha_dropout", + decorate=[symbolic_helper._apply_params("aten::alpha_dropout")], +) +@_onnx_symbolic( + "aten::feature_dropout", + decorate=[symbolic_helper._apply_params("aten::feature_dropout")], +) +def _unsupported_dropout(name: str): + @symbolic_helper.parse_args("v", "none", "b") + def feature_dropout(g, input, p, train): + # NB: In inference mode, FeatureDropout is exported as an identity op. 
+ if train: + return symbolic_helper._unimplemented(name, "training mode", input) + return input + + return feature_dropout + + +@_onnx_symbolic("aten::norm") +@symbolic_helper.parse_args("v", "t", "is", "i", "v") +def norm(g: jit_utils.GraphContext, self, p, dim, keepdim, dtype=None): + if p == 1: + f = symbolic_helper._reduce_op_symbolic_helper("ReduceL1") + elif p == 2: + f = symbolic_helper._reduce_op_symbolic_helper("ReduceL2") + else: + raise errors.SymbolicValueError( + "ONNX export only supports p-norms with p of 1 or 2", self + ) + result = f(g, self, dim=dim, keepdim=keepdim) + if dtype is not None: + dtype = symbolic_helper._get_const(dtype, "i", "dtype") + result = g.op("Cast", result, to_i=_type_utils.JitScalarType(dtype).onnx_type()) + return result + + +@_onnx_symbolic("aten::conv_tbc") +@symbolic_helper.parse_args("v", "v", "v", "i") +def conv_tbc(g: jit_utils.GraphContext, input, weight, bias, pad): + # input must have 3 dimensions, see: + # https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/ConvolutionTBC.cpp#L8-L10 + # input = (time, batch, in_channels) + # weight = (kernel_width, in_channels, out_channels) + # bias = (out_channels,) + input = g.op("Transpose", input, perm_i=[1, 2, 0]) + weight = g.op("Transpose", weight, perm_i=[2, 1, 0]) + conv = conv1d(g, input, weight, bias, [1], [pad], [1], 1) + return g.op("Transpose", conv, perm_i=[2, 0, 1]) + + +@_onnx_symbolic("aten::_unique") +@symbolic_helper.parse_args("v", "i", "i") +def _unique(g: jit_utils.GraphContext, input, sorted, return_inverse): + return symbolic_helper._onnx_unsupported("_unique", input) + + +@_onnx_symbolic("aten::_unique2") +@symbolic_helper.parse_args("v", "i", "i", "i") +def _unique2(g: jit_utils.GraphContext, input, sorted, return_inverse, return_counts): + symbolic_helper._onnx_opset_unsupported("_unique2", 9, 11, input) + + +@_onnx_symbolic("aten::_cast_Byte") +@_deprecation.deprecated( + "2.0", + "the future", + "Avoid using this function and create a Cast node instead", +) +def _cast_Byte(g: jit_utils.GraphContext, input, non_blocking): + return g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.UINT8) + + +@_onnx_symbolic("aten::_cast_Char") +@_deprecation.deprecated( + "2.0", + "the future", + "Avoid using this function and create a Cast node instead", +) +def _cast_Char(g: jit_utils.GraphContext, input, non_blocking): + return g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.INT8) + + +@_onnx_symbolic("aten::_cast_Short") +@_deprecation.deprecated( + "2.0", + "the future", + "Avoid using this function and create a Cast node instead", +) +def _cast_Short(g: jit_utils.GraphContext, input, non_blocking): + return g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.INT16) + + +@_onnx_symbolic("aten::_cast_Int") +@_deprecation.deprecated( + "2.0", + "the future", + "Avoid using this function and create a Cast node instead", +) +def _cast_Int(g: jit_utils.GraphContext, input, non_blocking): + return g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.INT32) + + +@_onnx_symbolic("aten::_cast_Long") +@_deprecation.deprecated( + "2.0", + "the future", + "Avoid using this function and create a Cast node instead", +) +def _cast_Long(g: jit_utils.GraphContext, input, non_blocking): + return g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.INT64) + + +@_onnx_symbolic("aten::_cast_Half") +@_deprecation.deprecated( + "2.0", + "the future", + "Avoid using this function and create a Cast node instead", +) +def _cast_Half(g: jit_utils.GraphContext, input, non_blocking): + return
g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.FLOAT16) + + +@_onnx_symbolic("aten::_cast_Float") +@_deprecation.deprecated( + "2.0", + "the future", + "Avoid using this function and create a Cast node instead", +) +def _cast_Float(g: jit_utils.GraphContext, input, non_blocking): + return g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.FLOAT) + + +@_onnx_symbolic("aten::_cast_Double") +@_deprecation.deprecated( + "2.0", + "the future", + "Avoid using this function and create a Cast node instead", +) +def _cast_Double(g: jit_utils.GraphContext, input, non_blocking): + return g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.DOUBLE) + + +@_onnx_symbolic("aten::_cast_Bool") +@_deprecation.deprecated( + "2.0", + "the future", + "Avoid using this function and create a Cast node instead", +) +def _cast_Bool(g: jit_utils.GraphContext, input, non_blocking): + return g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.BOOL) + + +@_onnx_symbolic("aten::empty") +@symbolic_helper.parse_args("v", "i", "v", "v", "v", "v") +def empty( + g: jit_utils.GraphContext, + sizes, + dtype, + layout, + device, + pin_memory=False, + memory_format=None, +): + return zeros(g, sizes, dtype, layout, device, pin_memory) + + +@_onnx_symbolic("aten::empty_like") +@symbolic_helper.parse_args("v", "i", "v", "v", "v", "v") +def empty_like( + g: jit_utils.GraphContext, + input, + dtype=None, + layout=None, + device=None, + pin_memory=False, + memory_format=None, +): + return zeros_like(g, input, dtype, layout, device, pin_memory) + + +@_onnx_symbolic("aten::new_empty") +def new_empty( + g: jit_utils.GraphContext, self, sizes, dtype, layout, device, pin_memory=False +): + self_dtype = symbolic_helper._try_get_scalar_type(self) + if symbolic_helper._is_none(dtype) and self_dtype is not None: + dtype = self_dtype + return empty(g, sizes, dtype, layout, device, pin_memory) + + +@_onnx_symbolic("aten::scalar_tensor") +def scalar_tensor(g: jit_utils.GraphContext, scalar, dtype, *options): + dtype = symbolic_helper._get_const(dtype, "i", "dtype") + if dtype is None: + dtype = _type_utils.JitScalarType.FLOAT + scalar = g.op("Cast", scalar, to_i=_type_utils.JitScalarType(dtype).onnx_type()) + return scalar + + +@_onnx_symbolic("aten::tensor") +def tensor( + g: jit_utils.GraphContext, data, dtype=None, device=None, requires_grad=False +): + dtype = symbolic_helper._get_const(dtype, "i", "dtype") + if symbolic_helper._is_packed_list(data): + if dtype is None: + dtype = _type_utils.JitScalarType.from_value( + symbolic_helper._unpack_list(data)[0] + ) + input_list = [] + for t in symbolic_helper._unpack_list(data): + shape_reference = g.op("Constant", value_t=torch.LongTensor([1])) + t = symbolic_helper._reshape_helper(g, t, shape_reference) + t = g.op("Cast", t, to_i=_type_utils.JitScalarType(dtype).onnx_type()) + input_list.append(t) + return g.op("Concat", *input_list, axis_i=0) + else: + if dtype is None: + dtype = _type_utils.JitScalarType.from_value(data) + if symbolic_helper._is_list(data) and ( + symbolic_helper._is_tensor_list(data) + or symbolic_helper._is_scalar_list(data) + ): + data = g.op("ConcatFromSequence", data, axis_i=0, new_axis_i=1) + return g.op("Cast", data, to_i=_type_utils.JitScalarType(dtype).onnx_type()) + + +@_onnx_symbolic("aten::as_tensor") +def as_tensor(g: jit_utils.GraphContext, data, dtype=None, device=None): + return tensor(g, data, dtype, device) + + +@_onnx_symbolic("aten::zeros") +@symbolic_helper.parse_args("v", "i", "v", "v", "v") +def zeros(g: jit_utils.GraphContext, sizes, dtype, 
layout, device, pin_memory=False): + # NOTE: no way to set device, layout and pin_memory in ONNX, so we ignore them + if dtype is None: + scalar_type = _type_utils.JitScalarType.FLOAT + else: + scalar_type = _type_utils.JitScalarType(dtype) + sizes_ = symbolic_helper._maybe_get_const(sizes, "is") + if isinstance(sizes_, list) and len(sizes_) == 0: + sizes = g.op("Constant", value_t=torch.tensor([]).to(torch.int64)) + return g.op( + "ConstantOfShape", + sizes, + value_t=torch.tensor([0], dtype=scalar_type.dtype()), + ) + + +@_onnx_symbolic("aten::zeros_like") +@symbolic_helper.parse_args("v", "i", "v", "v", "v", "v") +def zeros_like( + g: jit_utils.GraphContext, + input, + dtype=None, + layout=None, + device=None, + pin_memory=False, + memory_format=None, +): + shape = g.op("Shape", input) + if symbolic_helper._is_none(dtype): + scalar_type = _type_utils.JitScalarType.from_value( + input, _type_utils.JitScalarType.FLOAT + ) + else: + scalar_type = _type_utils.JitScalarType(dtype) + return g.op( + "ConstantOfShape", + shape, + value_t=torch.tensor([0], dtype=scalar_type.dtype()), + ) + + +@_onnx_symbolic("aten::new_zeros") +def new_zeros( + g: jit_utils.GraphContext, self, sizes, dtype, layout, device, pin_memory=False +): + self_dtype = symbolic_helper._try_get_scalar_type(self) + + if symbolic_helper._is_none(dtype) and self_dtype is not None: + dtype = self_dtype + return zeros(g, sizes, dtype, layout, device, pin_memory) + + +@_onnx_symbolic("aten::zero") +def zero(g: jit_utils.GraphContext, self): + self_dtype = symbolic_helper._try_get_scalar_type(self) + return zeros_like(g, self, self_dtype) + + +@_onnx_symbolic("aten::ones") +@symbolic_helper.parse_args("v", "i", "v", "v", "v") +def ones(g: jit_utils.GraphContext, sizes, dtype, layout, device, pin_memory=False): + if dtype is None: + scalar_type = _type_utils.JitScalarType.FLOAT + else: + scalar_type = _type_utils.JitScalarType(dtype) + sizes_ = symbolic_helper._maybe_get_const(sizes, "is") + if isinstance(sizes_, list) and len(sizes_) == 0: + sizes = g.op("Constant", value_t=torch.tensor([]).to(torch.int64)) + return g.op( + "ConstantOfShape", + sizes, + value_t=torch.tensor([1], dtype=scalar_type.dtype()), + ) + + +@_onnx_symbolic("aten::ones_like") +@symbolic_helper.parse_args("v", "i", "v", "v", "v", "v") +def ones_like( + g: jit_utils.GraphContext, + input, + dtype=None, + layout=None, + device=None, + pin_memory=False, + memory_format=None, +): + shape = g.op("Shape", input) + if symbolic_helper._is_none(dtype): + scalar_type = _type_utils.JitScalarType.from_value( + input, _type_utils.JitScalarType.FLOAT + ) + else: + scalar_type = _type_utils.JitScalarType(dtype) + return g.op( + "ConstantOfShape", + shape, + value_t=torch.tensor([1], dtype=scalar_type.dtype()), + ) + + +@_onnx_symbolic("aten::new_ones") +def new_ones( + g: jit_utils.GraphContext, self, sizes, dtype, layout, device, pin_memory=False +): + self_dtype = symbolic_helper._try_get_scalar_type(self) + if symbolic_helper._is_none(dtype) and self_dtype is not None: + dtype = self_dtype + return ones(g, sizes, dtype, layout, device, pin_memory) + + +@_onnx_symbolic("aten::full") +def full( + g: jit_utils.GraphContext, sizes, value, dtype, layout, device, pin_memory=False +): + const_value = symbolic_helper._maybe_get_const(value, "t") + if symbolic_helper._is_value(const_value): + dtype = _type_utils.JitScalarType.FLOAT if dtype is None else dtype + tmp = zeros(g, sizes, dtype, layout, device) + return add(g, tmp, value, g.op("Constant", value_t=torch.tensor(1))) +
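+ # NOTE (illustrative): for a non-constant fill value, the branch above lowers + # full() to a broadcasted addition, relying on semantics such as + # torch.zeros(2, 3) + 5.0 == torch.full((2, 3), 5.0)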
else: + dtype = symbolic_helper._get_const(dtype, "i", "dtype") + if dtype is None: + scalar_type = _type_utils.JitScalarType.FLOAT + else: + scalar_type = _type_utils.JitScalarType(dtype) + sizes_ = symbolic_helper._maybe_get_const(sizes, "is") + if isinstance(sizes_, list) and len(sizes_) == 0: + sizes = g.op("Constant", value_t=torch.tensor([]).to(torch.int64)) + return g.op( + "ConstantOfShape", + sizes, + value_t=const_value.view(1).to(scalar_type.dtype()), + ) + + +@_onnx_symbolic("aten::full_like") +def full_like( + g: jit_utils.GraphContext, + input, + fill_value, + dtype=None, + layout=None, + device=None, + pin_memory=False, + memory_format=None, +): + fill_value = symbolic_helper._maybe_get_const(fill_value, "f") + dtype = symbolic_helper._get_const(dtype, "i", "dtype") + if dtype is None: + scalar_type = _type_utils.JitScalarType.from_value( + input, _type_utils.JitScalarType.FLOAT + ) + else: + scalar_type = _type_utils.JitScalarType(dtype) + if symbolic_helper._is_value(fill_value): + tmp = zeros_like(g, input, dtype, layout, device) + fill_value = g.op("Cast", fill_value, to_i=scalar_type.onnx_type()) + return add(g, tmp, fill_value, g.op("Constant", value_t=torch.tensor(1))) + else: + shape = g.op("Shape", input) + return g.op( + "ConstantOfShape", + shape, + value_t=torch.tensor([fill_value], dtype=scalar_type.dtype()), + ) + + +@_onnx_symbolic("aten::new_full") +def new_full( + g: jit_utils.GraphContext, + self, + size, + fill_value, + dtype, + layout, + device, + pin_memory=False, +): + self_dtype = symbolic_helper._try_get_scalar_type(self) + if symbolic_helper._is_none(dtype) and self_dtype is not None: + dtype = self_dtype + return full(g, size, fill_value, dtype, layout, device, pin_memory) + + +@_onnx_symbolic("aten::eye") +def eye(g: jit_utils.GraphContext, *args): + if len(args) == 5: + # aten::eye(n, dtype, layout, device, pin_memory) + n, dtype, layout, device, pin_memory = args + dim_size = symbolic_helper._unsqueeze_helper(g, n, [0]) + shape = g.op("Concat", dim_size, dim_size, axis_i=0) + tensor = zeros(g, shape, dtype, layout, device) + return g.op("EyeLike", tensor) + if len(args) == 6: + # aten::eye(n, m, dtype, layout, device, pin_memory) + n, m, dtype, layout, device, pin_memory = args + shape = g.op( + "Concat", + symbolic_helper._unsqueeze_helper(g, n, [0]), + symbolic_helper._unsqueeze_helper(g, m, [0]), + axis_i=0, + ) + tensor = zeros(g, shape, dtype, layout, device) + return g.op("EyeLike", tensor) + + return symbolic_helper._unimplemented("aten::eye", f"with {len(args)} arguments") + + +@_onnx_symbolic("aten::slice") +def slice(g: jit_utils.GraphContext, self, *args): + if len(args) == 4: + # aten::slice(Tensor self, int dim, int start, int end, int step) -> Tensor + dim, start, end, step = args + step = symbolic_helper._parse_arg(step, "i") + if step != 1: + raise errors.SymbolicValueError("step!=1 is currently not supported", self) + is_start_none = start.node().kind() == "prim::Constant" and isinstance( + start.type(), _C.NoneType + ) + is_end_none = end.node().kind() == "prim::Constant" and isinstance( + end.type(), _C.NoneType + ) + is_start_onnx_const = start.node().kind() == "onnx::Constant" + is_end_onnx_const = end.node().kind() == "onnx::Constant" + if ( + ((not is_start_none) and (not is_start_onnx_const)) + or ((not is_end_none) and (not is_end_onnx_const)) + or dim.node().kind() != "onnx::Constant" + ): + if GLOBALS.operator_export_type == _C_onnx.OperatorExportTypes.ONNX: + raise errors.SymbolicValueError( + "Unsupported: ONNX export 
of Slice with dynamic inputs. DynamicSlice " + "is a deprecated experimental op. Please use statically allocated " + "variables or export to a higher opset version.", + self, + ) + else: + start_unsqueezed = symbolic_helper._unsqueeze_helper(g, start, [0]) + end_unsqueezed = symbolic_helper._unsqueeze_helper(g, end, [0]) + dim_unsqueezed = symbolic_helper._unsqueeze_helper(g, dim, [0]) + return g.op( + "DynamicSlice", + self, + start_unsqueezed, + end_unsqueezed, + dim_unsqueezed, + ) + else: + start = 0 if is_start_none else symbolic_helper._parse_arg(start, "i") + end = ( + _constants.INT64_MAX + if is_end_none + else symbolic_helper._parse_arg(end, "i") + ) + dim = symbolic_helper._parse_arg(dim, "i") + return symbolic_helper._slice_helper( + g, self, axes=[dim], starts=[start], ends=[end] + ) + elif len(args) == 3: + # aten::slice(t[] l, int start, int end, int step) -> t[] + start, end, step = args + dim = 0 + is_start_none = start.node().kind() == "prim::Constant" and isinstance( + start.type(), _C.NoneType + ) + is_end_none = end.node().kind() == "prim::Constant" and isinstance( + end.type(), _C.NoneType + ) + start = 0 if is_start_none else symbolic_helper._parse_arg(start, "i") + end = ( + _constants.INT64_MAX + if is_end_none + else symbolic_helper._parse_arg(end, "i") + ) + return symbolic_helper._slice_helper( + g, self, axes=[dim], starts=[start], ends=[end] + ) + + return symbolic_helper._unimplemented("aten::slice", f"with {len(args)} arguments") + + +@_onnx_symbolic("aten::hardtanh") +@symbolic_helper.quantized_args(True) +@symbolic_helper.parse_args("v", "f", "f") +def hardtanh(g: jit_utils.GraphContext, self: _C.Value, min_val: float, max_val: float): + return symbolic_helper._op_with_optional_float_cast( + g, "Clip", self, min_f=min_val, max_f=max_val, opset_before=12 + ) + + +@_onnx_symbolic("aten::hardswish") +@symbolic_helper.quantized_args(True) +@symbolic_helper.parse_args("v") +def hardswish(g: jit_utils.GraphContext, self): + hs = hardsigmoid(g, self) + return g.op("Mul", self, hs) + + +@_onnx_symbolic("aten::hardsigmoid") +# Fixed scale and zero_point, discovered from aten/src/ATen/native/quantized/cpu/qhardsigmoid.cpp +@symbolic_helper.quantized_args(True, scale=1.0 / 256.0, zero_point=0) +@symbolic_helper.parse_args("v") +def hardsigmoid(g: jit_utils.GraphContext, self): + # Set alpha_f to 1 / 6 to make op equivalent to PyTorch's definition of Hardsigmoid. 
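+ # (ONNX HardSigmoid computes max(0, min(1, alpha * x + beta)) with beta + # defaulting to 0.5, which matches PyTorch's clamp(x / 6 + 1 / 2, 0, 1).)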
+ # See https://pytorch.org/docs/stable/generated/torch.nn.Hardsigmoid.html + return g.op("HardSigmoid", self, alpha_f=1 / 6) + + +@_onnx_symbolic("aten::tanhshrink") +@symbolic_helper.parse_args("v") +def tanhshrink(g: jit_utils.GraphContext, self): + return g.op("Sub", self, tanh(g, self)) + + +@_onnx_symbolic("aten::hardshrink") +@symbolic_helper.parse_args("v", "f") +def hardshrink(g: jit_utils.GraphContext, self, lambd): + scalar_type = _type_utils.JitScalarType.from_value( + self, _type_utils.JitScalarType.FLOAT + ) + lambd_op = g.op( + "Constant", + value_t=torch.tensor(lambd, dtype=scalar_type.dtype()), + ) + cond = logical_or(g, gt(g, self, lambd_op), lt(g, self, neg(g, lambd_op))) + return g.op( + "Where", + cond, + self, + g.op( + "Constant", + value_t=torch.tensor(0, dtype=scalar_type.dtype()), + ), + ) + + +@_onnx_symbolic("aten::softshrink") +@symbolic_helper.parse_args("v", "f") +def softshrink(g: jit_utils.GraphContext, self, lambd): + scalar_type = _type_utils.JitScalarType.from_value( + self, _type_utils.JitScalarType.FLOAT + ) + lambd_op = g.op( + "Constant", + value_t=torch.tensor(lambd, dtype=scalar_type.dtype()), + ) + gt_cond = gt(g, self, lambd_op) + gt_out = g.op( + "Where", + gt_cond, + sub(g, self, lambd_op), + g.op( + "Constant", + value_t=torch.tensor(0, dtype=scalar_type.dtype()), + ), + ) + lt_cond = lt(g, self, neg(g, lambd_op)) + lt_out = g.op( + "Where", + lt_cond, + add(g, self, lambd_op), + g.op( + "Constant", + value_t=torch.tensor(0, dtype=scalar_type.dtype()), + ), + ) + return add(g, gt_out, lt_out) + + +@_onnx_symbolic("aten::alias") +def alias(g: jit_utils.GraphContext, self): + return self + + +@_onnx_symbolic("aten::unsqueeze") +@symbolic_helper.parse_args("v", "i") +def unsqueeze(g: jit_utils.GraphContext, self, dim): + """Implement unsqueezing a pytorch tensor in ONNX by inserting a new dimension at the specified `dim`""" + # Handle negative dim + if dim < 0: + rank = symbolic_helper._get_tensor_rank(self) + if rank is not None: + warnings.warn( + "ONNX export unsqueeze with negative axis " + + str(dim) + + " might cause the onnx model to be incorrect. " + + "Negative axis is not supported in ONNX. " + + "Axis is converted to " + + str(dim + rank + 1) + + " based on input shape at export time. " + + "Passing a tensor of a different rank at execution time will be incorrect." + ) + dim = dim + rank + 1 + else: + return symbolic_helper._unimplemented( + "unsqueeze", "negative axis with unknown input rank", self + ) + + return symbolic_helper._unsqueeze_helper(g, self, axes_i=[dim]) + + +@_onnx_symbolic("aten::sort") +# TODO(justinchuby): Support multiple quantized args in output +@symbolic_helper.parse_args("v", "i", "i", "none") +def sort(g: jit_utils.GraphContext, self, dim, descending, out=None): + if out is not None: + symbolic_helper._unimplemented( + "Sort", "Out parameter is not supported for sort", self + ) + self_sizes = symbolic_helper._get_tensor_sizes(self) + try: + dim_size = self_sizes[dim] + except Exception: + # FIXME(justinchuby): Avoid catching Exception. + # Catch a more specific exception instead.
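+ # NOTE (illustrative sketch): sort is lowered below to a full-width TopK, since + # TopK(k=dim_size) over an axis returns every element of that axis in descending + # order together with its index, matching torch.sort(x, dim=dim, descending=True).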
+ dim_size = None + + if dim_size is None: + return symbolic_helper._unimplemented("Sort", "input size not accessible", self) + + return g.op("TopK", self, k_i=dim_size, axis_i=dim, outputs=2) + + +@_onnx_symbolic("aten::numel") +def numel(g: jit_utils.GraphContext, self): + return symbolic_helper._numel_helper(g, self) + + +@_onnx_symbolic("aten::topk") +# TODO(justinchuby): Support multiple quantized args in output +@symbolic_helper.parse_args("v", "i", "i", "i", "i", "none") +def topk(g: jit_utils.GraphContext, self, k, dim, largest, sorted, out=None): + if out is not None: + symbolic_helper._unimplemented( + "TopK", "Out parameter is not supported for topk", self + ) + if not largest: + symbolic_helper._unimplemented("TopK", "Ascending TopK is not supported", self) + + return g.op("TopK", self, k_i=k, axis_i=dim, outputs=2) + + +@_onnx_symbolic("prim::convert_element_type") +def convert_element_type(g: jit_utils.GraphContext, self, *args): + dtype = symbolic_helper._get_const(args[0], "i", "dtype") + return g.op("Cast", self, to_i=_type_utils.JitScalarType(dtype).onnx_type()) + + +@_onnx_symbolic("aten::to") +def to(g: jit_utils.GraphContext, self, *args): + def is_aten_to_device_only(args): + if len(args) == 4: + # aten::to(Tensor, Device, bool, bool, memory_format) + return ( + args[0].node().kind() == "prim::device" + or args[0].type().isSubtypeOf(_C.ListType.ofInts()) + or isinstance(args[0].type(), _C.DeviceObjType) + ) + elif len(args) == 5: + # aten::to(Tensor, Device, ScalarType, bool, bool, memory_format) + # When dtype is None, this is an aten::to(device) call + dtype = symbolic_helper._get_const(args[1], "i", "dtype") + return dtype is None + elif len(args) in (6, 7): + # aten::to(Tensor, ScalarType, Layout, Device, bool, bool, memory_format) -> Tensor + # aten::to(Tensor, ScalarType, Layout, Device, bool, bool, bool, memory_format) -> Tensor + # When dtype is None, this is an aten::to(device) call + dtype = symbolic_helper._get_const(args[0], "i", "dtype") + return dtype is None + return False + + # ONNX doesn't have a concept of a device, so we ignore device-only casts + if is_aten_to_device_only(args): + return self + + if len(args) == 4: + # TestONNXRuntime::test_ones_bool shows args[0] of aten::to() can be onnx::Constant[value=<Tensor>]() + # In this case, the constant value is a tensor, not an int, + # so symbolic_helper._maybe_get_const(args[0], 'i') would not work.
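+ # NOTE (illustrative): the block below therefore unwraps a 0-d constant tensor + # (via .item()) into a plain int before treating it as a scalar-type index.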
+ dtype = args[0] + if ( + symbolic_helper._is_value(args[0]) + and args[0].node().kind() == "onnx::Constant" + ): + tval = symbolic_helper._node_get(args[0].node(), "value") + if isinstance(tval, torch.Tensor): + if len(tval.shape) == 0: + tval = tval.item() + dtype = int(tval) + else: + dtype = tval + + if symbolic_helper._is_value(dtype) or isinstance(dtype, torch.Tensor): + # aten::to(Tensor, Tensor, bool, bool, memory_format) + dtype = _type_utils.JitScalarType.from_value(args[0]) + return g.op( + "Cast", + self, + to_i=dtype.onnx_type(), + ) + else: + # aten::to(Tensor, ScalarType, bool, bool, memory_format) + # memory_format is ignored + return g.op("Cast", self, to_i=_type_utils.JitScalarType(dtype).onnx_type()) + elif len(args) == 5: + # aten::to(Tensor, Device, ScalarType, bool, bool, memory_format) + dtype = symbolic_helper._get_const(args[1], "i", "dtype") + # memory_format is ignored + return g.op("Cast", self, to_i=_type_utils.JitScalarType(dtype).onnx_type()) + elif len(args) == 6: + # aten::to(Tensor, ScalarType, Layout, Device, bool, bool, memory_format) -> Tensor + dtype = symbolic_helper._get_const(args[0], "i", "dtype") + # Layout, device and memory_format are ignored + return g.op("Cast", self, to_i=_type_utils.JitScalarType(dtype).onnx_type()) + elif len(args) == 7: + # aten::to(Tensor, ScalarType, Layout, Device, bool, bool, bool, memory_format) -> Tensor + dtype = symbolic_helper._get_const(args[0], "i", "dtype") + # Layout, device and memory_format are ignored + return g.op("Cast", self, to_i=_type_utils.JitScalarType(dtype).onnx_type()) + + return symbolic_helper._onnx_unsupported("Unknown aten::to signature", self) + + +@_onnx_symbolic("aten::repeat") +def repeat(g: jit_utils.GraphContext, self, repeats): + dtype = _type_utils.JitScalarType.INT64 + shape_ = ones_like(g, repeats, dtype) + self = g.op("Expand", self, shape_) + return g.op("Tile", self, repeats) + + +@_onnx_symbolic("aten::repeat_interleave") +def repeat_interleave( + g: jit_utils.GraphContext, self, repeats, dim=None, output_size=None +): + repeats_dim = symbolic_helper._get_tensor_rank(repeats) + repeats_sizes = symbolic_helper._get_tensor_sizes(repeats) + input_sizes = symbolic_helper._get_tensor_sizes(self) + if repeats_dim is None: + raise errors.SymbolicValueError( + "Unsupported: ONNX export of repeat_interleave for unknown repeats rank.", + self, + ) + if repeats_sizes is None: + raise errors.SymbolicValueError( + "Unsupported: ONNX export of repeat_interleave for unknown repeats size.", + self, + ) + if input_sizes is None: + raise errors.SymbolicValueError( + "Unsupported: ONNX export of repeat_interleave for unknown input size.", + self, + ) + + # if dim is None flatten + # By default, use the flattened input array, and return a flat output array + if symbolic_helper._is_none(dim): + self = symbolic_helper._reshape_helper( + g, self, g.op("Constant", value_t=torch.tensor([-1])) + ) + dim = torch.tensor(0, dtype=torch.int64) + else: + dim = symbolic_helper._maybe_get_scalar(dim) + + # Handle cases where dim is negative + if dim < 0: + dim += len(input_sizes) + + input_sizes_temp = input_sizes.copy() + for idx, input_size in enumerate(input_sizes): + if input_size is None: + input_sizes[idx], input_sizes_temp[idx] = 0, -1 + + # Cases where repeats is an int or single value tensor + if repeats_dim == 0 or (repeats_dim == 1 and repeats_sizes[0] == 1): + if input_sizes[dim] == 0: + return symbolic_helper._onnx_opset_unsupported_detailed( + "repeat_interleave", + 9, + 13, + "Unsupported along 
dimension with unknown input size", + self, + ) + return symbolic_helper._repeat_interleave_single_value_repeat_helper( + g, self, repeats, dim + ) + + # Cases where repeats is a 1 dim Tensor + elif repeats_dim == 1: + if input_sizes[dim] == 0: + return symbolic_helper._onnx_opset_unsupported_detailed( + "repeat_interleave", + 9, + 13, + "Unsupported along dimension with unknown input size", + self, + ) + if repeats_sizes[0] is None: + return symbolic_helper._onnx_opset_unsupported_detailed( + "repeat_interleave", + 9, + 13, + "Unsupported for cases with dynamic repeats", + self, + ) + assert ( + repeats_sizes[0] == input_sizes[dim] + ), "repeats must have the same size as input along dim" + reps = repeats_sizes[0] + else: + raise errors.SymbolicValueError("repeats must be 0-dim or 1-dim tensor", self) + + final_splits = [] + r_splits = symbolic_helper._repeat_interleave_split_helper(g, repeats, reps, 0) + i_splits = symbolic_helper._repeat_interleave_split_helper(g, self, reps, dim) + input_sizes[dim], input_sizes_temp[dim] = -1, 1 + for idx, r_split in enumerate(r_splits): + i_split = unsqueeze(g, i_splits[idx], dim + 1) + r_concat = [ + g.op("Constant", value_t=torch.LongTensor(input_sizes_temp[: dim + 1])), + r_split, + g.op("Constant", value_t=torch.LongTensor(input_sizes_temp[dim + 1 :])), + ] + r_concat = g.op("Concat", *r_concat, axis_i=0) + i_split = expand(g, i_split, r_concat, None) + i_split = symbolic_helper._reshape_helper( + g, + i_split, + g.op("Constant", value_t=torch.LongTensor(input_sizes)), + allowzero=0, + ) + final_splits.append(i_split) + return g.op("Concat", *final_splits, axis_i=dim) + + +@_onnx_symbolic("aten::pixel_shuffle") +@symbolic_helper.parse_args("v", "i") +def pixel_shuffle(g: jit_utils.GraphContext, self, upscale_factor): + dims = symbolic_helper._get_tensor_sizes(self) + if len(dims) != 4: + return symbolic_helper._unimplemented( + "pixel_shuffle", "only supports 4d input", self + ) + if any(i is None for i in dims[1:]): + after_view = symbolic_helper._reshape_helper( + g, + symbolic_helper._unsqueeze_helper(g, self, [2, 3]), + g.op( + "Constant", + value_t=torch.tensor([0, -1, upscale_factor, upscale_factor, 0, 0]), + ), + allowzero=0, + ) + after_transpose = g.op("Transpose", after_view, perm_i=[0, 1, 4, 2, 5, 3]) + # For dynamic input shapes, two reshapes are performed + reshape_h = symbolic_helper._reshape_helper( + g, + after_transpose, + g.op("Constant", value_t=torch.tensor([0, 0, -1, 1, 0, 0])), + allowzero=0, + ) + reshape_w = symbolic_helper._reshape_helper( + g, + reshape_h, + g.op("Constant", value_t=torch.tensor([0, 0, 0, 0, -1, 1])), + allowzero=0, + ) + return symbolic_helper._squeeze_helper(g, reshape_w, [3, 5]) + else: + output_channel = dims[1] // upscale_factor // upscale_factor + after_view = symbolic_helper._reshape_helper( + g, + self, + g.op( + "Constant", + value_t=torch.tensor( + [ + -1, + output_channel, + upscale_factor, + upscale_factor, + dims[2], + dims[3], + ] + ), + ), + allowzero=0, + ) + after_transpose = g.op("Transpose", after_view, perm_i=[0, 1, 4, 2, 5, 3]) + return symbolic_helper._reshape_helper( + g, + after_transpose, + g.op( + "Constant", + value_t=torch.tensor( + [ + -1, + output_channel, + dims[2] * upscale_factor, + dims[3] * upscale_factor, + ] + ), + ), + allowzero=0, + ) + + +@_onnx_symbolic("aten::pixel_unshuffle") +@symbolic_helper.parse_args("v", "i") +def pixel_unshuffle(g: jit_utils.GraphContext, self, downscale_factor): + dims = symbolic_helper._get_tensor_sizes(self) + if len(dims) != 4: + return
symbolic_helper._unimplemented( + "pixel_unshuffle", "only support 4d input", self + ) + if any(i is None for i in dims[1:]): + # For dynamic input shapes, two reshapes are performed + reshape_h = symbolic_helper._reshape_helper( + g, + symbolic_helper._unsqueeze_helper(g, self, [3]), + g.op("Constant", value_t=torch.tensor([0, 0, -1, downscale_factor, 0])), + allowzero=0, + ) + reshape_w = symbolic_helper._reshape_helper( + g, + reshape_h, + g.op("Constant", value_t=torch.tensor([0, 0, 0, 0, -1, downscale_factor])), + allowzero=0, + ) + after_transpose = g.op("Transpose", reshape_w, perm_i=[0, 1, 3, 5, 2, 4]) + final_reshape = symbolic_helper._reshape_helper( + g, + after_transpose, + g.op("Constant", value_t=torch.tensor([0, -1, 1, 1, 0, 0])), + allowzero=0, + ) + return symbolic_helper._squeeze_helper(g, final_reshape, [2, 3]) + else: + output_channel = dims[1] * downscale_factor * downscale_factor + after_view = symbolic_helper._reshape_helper( + g, + self, + g.op( + "Constant", + value_t=torch.tensor( + [ + -1, + dims[1], + dims[2] // downscale_factor, + downscale_factor, + dims[3] // downscale_factor, + downscale_factor, + ] + ), + ), + allowzero=0, + ) + after_transpose = g.op("Transpose", after_view, perm_i=[0, 1, 3, 5, 2, 4]) + return symbolic_helper._reshape_helper( + g, + after_transpose, + g.op( + "Constant", + value_t=torch.tensor( + [ + -1, + output_channel, + dims[2] // downscale_factor, + dims[3] // downscale_factor, + ] + ), + ), + allowzero=0, + ) + + +def _generic_rnn( + g: jit_utils.GraphContext, + variant, + input, + initial_states, + all_weights, + has_biases, + num_layers, + dropout, + train, + bidirectional, + batch_first=None, + batch_sizes=None, +): + warnings.warn( + "Exporting a model to ONNX with a batch_size other than 1 " + + "and variable-length sequences with " + + variant + + " can cause an error " + + "when running the ONNX model with a different batch size. " + + "Make sure to save the model with a batch size of 1, " + + "or define the initial states (h0/c0) as inputs of the model. 
" + ) + + onnxActivations = [ + "Relu", + "Tanh", + "Sigmoid", + "Affine", + "LeakyRelu", + "ThresholdedRelu", + "ScaledTanh", + "HardSigmoid", + "Elu", + "Softsign", + "Softplus", + ] + variantToOnnxActivationMap = dict( + zip([act_fun.lower() for act_fun in onnxActivations], onnxActivations) + ) + weights_per_layer = 4 if has_biases else 2 + # this means that projections are used inside LSTM, so need to tell user that it's not supported + if variant == "LSTM" and len(all_weights) != num_layers * weights_per_layer * ( + 1 + bidirectional + ): + return symbolic_helper._unimplemented("LSTM", "LSTMs with projections", input) + assert len(all_weights) == num_layers * weights_per_layer * (1 + bidirectional) + layer_weights = [ + all_weights[i : i + weights_per_layer] + for i in range(0, len(all_weights), weights_per_layer) + ] + if batch_first: + # batch, seq, feat -> seq, batch, feat + input = g.op("Transpose", input, perm_i=[1, 0, 2]) + if dropout and train: + return symbolic_helper._unimplemented( + "RNN/GRU/LSTM", "dropout in training mode", input + ) + + if variant.startswith("RNN"): + nonlinearity = variantToOnnxActivationMap[variant[4:].lower()] + variant = "RNN" + + w_hh = all_weights[1] + hidden_size = symbolic_helper._get_tensor_dim_size(w_hh, 1) + if hidden_size is None: + return symbolic_helper._unimplemented( + "RNN/GRU/LSTM", "unknown hidden size", input + ) + + unidirectional = not bidirectional + + prev_output = input + + h_outs = [] + if variant == "RNN" or variant == "GRU": + h0 = initial_states + elif variant == "LSTM": + h0, c0 = initial_states + c_outs = [] + + sequence_lens = unused(g) if batch_sizes is None else batch_sizes + + if variant == "GRU": + # pytorch is reset, input, hidden + # onnx is input, reset, hidden + reform_permutation = [(1, 2), (0, 1), (2, 3)] + elif variant == "LSTM": + # pytorch is input, forget, cell, output. + # onnx is input, output, forget, cell. 
+ reform_permutation = [(0, 1), (3, 4), (1, 3)] + + def reform_weights(g, w, n, intervals): + slices = [ + symbolic_helper._slice_helper(g, w, axes=[0], starts=[x * n], ends=[y * n]) + for x, y in intervals + ] + return g.op("Concat", *slices, axis_i=0) + + def transform_weights_no_bias(layer_index): + weights = layer_weights[layer_index] + if variant == "RNN": + weight_ih, weight_hh = weights + elif variant == "GRU" or variant == "LSTM": + weight_ih, weight_hh = ( + reform_weights(g, w, hidden_size, reform_permutation) for w in weights + ) + return tuple( + symbolic_helper._unsqueeze_helper(g, x, [0]) + for x in (weight_ih, weight_hh) # type: ignore[possibly-undefined] + ) + + def transform_weights(layer_index): + weights = layer_weights[layer_index] + if variant == "RNN": + weight_ih, weight_hh, bias_ih, bias_hh = weights + elif variant == "GRU" or variant == "LSTM": + weight_ih, weight_hh, bias_ih, bias_hh = ( + reform_weights(g, w, hidden_size, reform_permutation) for w in weights + ) + bias_concat = g.op("Concat", bias_ih, bias_hh, axis_i=0) # type: ignore[possibly-undefined] + return tuple( + symbolic_helper._unsqueeze_helper(g, x, [0]) + for x in (weight_ih, weight_hh, bias_concat) # type: ignore[possibly-undefined] + ) + + def retrieve_state(x, start, end): + return ( + x + if num_layers == 1 + else symbolic_helper._slice_helper( + g, x, axes=[0], starts=[start], ends=[end] + ) + ) + + for i in range(num_layers): + if unidirectional: + if weights_per_layer == 4: + weight_ih, weight_hh, bias_concat = transform_weights(i) + else: + weight_ih, weight_hh = transform_weights_no_bias(i) + bias_concat = unused(g) + + state_indices = i, i + 1 + else: + if weights_per_layer == 4: + weight_ih_f, weight_hh_f, bias_f = transform_weights(2 * i) + weight_ih_b, weight_hh_b, bias_b = transform_weights(2 * i + 1) + bias_concat = g.op("Concat", bias_f, bias_b, axis_i=0) + else: + weight_ih_f, weight_hh_f = transform_weights_no_bias(2 * i) + weight_ih_b, weight_hh_b = transform_weights_no_bias(2 * i + 1) + bias_concat = unused(g) + + weight_ih = g.op("Concat", weight_ih_f, weight_ih_b, axis_i=0) + weight_hh = g.op("Concat", weight_hh_f, weight_hh_b, axis_i=0) + + state_indices = 2 * i, 2 * i + 2 + + inputs = [prev_output, weight_ih, weight_hh, bias_concat, sequence_lens] + + inputs.append(retrieve_state(h0, *state_indices)) # type: ignore[possibly-undefined] + if variant == "LSTM": + inputs.append(retrieve_state(c0, *state_indices)) # type: ignore[possibly-undefined] + + extra_kwargs = {} if unidirectional else {"direction_s": "bidirectional"} + if variant == "RNN": + if bidirectional: + activation = [nonlinearity, nonlinearity] # type: ignore[possibly-undefined] + else: + activation = [nonlinearity] # type: ignore[possibly-undefined] + + prev_output, h_out = g.op( + "RNN", + *inputs, + outputs=2, + hidden_size_i=hidden_size, + activations_s=activation, + **extra_kwargs, + ) + elif variant == "GRU": + prev_output, h_out = g.op( + "GRU", + *inputs, + outputs=2, + hidden_size_i=hidden_size, + linear_before_reset_i=1, + **extra_kwargs, + ) + elif variant == "LSTM": + prev_output, h_out, c_out = g.op( + "LSTM", *inputs, outputs=3, hidden_size_i=hidden_size, **extra_kwargs + ) + + if bidirectional: + # The ONNX RNN/GRU/LSTM produce an output of dimensions + # seq_len, num_directions, batch, hidden_size + # We have to convert to match pytorch's expected + # seq_len, batch, num_directions * hidden_size + # by first moving num_directions before hidden_size with + # Transpose, and then combining it with 
hidden_size + # with Reshape. + prev_output = g.op("Transpose", prev_output, perm_i=[0, 2, 1, 3]) + prev_output = symbolic_helper._reshape_helper( + g, + prev_output, + g.op("Constant", value_t=torch.LongTensor([0, 0, -1])), + allowzero=0, + ) + else: + prev_output = symbolic_helper._squeeze_helper(g, prev_output, [1]) + + h_outs.append(h_out) # type: ignore[possibly-undefined] + if variant == "LSTM": + c_outs.append(c_out) # type: ignore[possibly-undefined] + if batch_first: + # seq, batch, num_directions * hidden_size -> batch, seq, num_directions * hidden_size + prev_output = g.op("Transpose", prev_output, perm_i=[1, 0, 2]) + h_outs = h_out if num_layers == 1 else g.op("Concat", *h_outs, axis_i=0) # type: ignore[possibly-undefined] + if variant == "RNN" or variant == "GRU": + return prev_output, h_outs + elif variant == "LSTM": + c_outs = c_out if num_layers == 1 else g.op("Concat", *c_outs, axis_i=0) # type: ignore[possibly-undefined] + return prev_output, h_outs, c_outs + + +@symbolic_helper.parse_args("v", "v", "v", "i", "i", "f", "i", "i", "i") +def _lstm_full( + g: jit_utils.GraphContext, + input, + hidden_v, + weight_v, + has_biases, + num_layers, + dropout, + train, + bidirectional, + batch_first, +): + hidden, weight = ( + symbolic_helper._unpack_list(hidden_v), + symbolic_helper._unpack_list(weight_v), + ) + return _generic_rnn( + g, + "LSTM", + input, + hidden, + weight, + has_biases, + num_layers, + dropout, + train, + bidirectional, + batch_first, + ) + + +@symbolic_helper.parse_args("v", "v", "v", "v", "i", "i", "f", "i", "i") +def _lstm_packed( + g: jit_utils.GraphContext, + input, + batch_sizes, + hidden_v, + weight_v, + has_biases, + num_layers, + dropout, + train, + bidirectional, +): + hidden, weight = ( + symbolic_helper._unpack_list(hidden_v), + symbolic_helper._unpack_list(weight_v), + ) + return _generic_rnn( + g, + "LSTM", + input, + hidden, + weight, + has_biases, + num_layers, + dropout, + train, + bidirectional, + batch_sizes=batch_sizes, + ) + + +@_onnx_symbolic("aten::lstm") +def lstm(g: jit_utils.GraphContext, *args): + if symbolic_helper._is_tensor_list(args[3]): + return _lstm_packed(g, *args) + else: + return _lstm_full(g, *args) + + +@_onnx_symbolic("aten::lstm_cell") +def lstm_cell(g: jit_utils.GraphContext, self, hidden, w_ih, w_hh, b_ih, b_hh): + input = symbolic_helper._unsqueeze_helper(g, self, [0]) + hidden = symbolic_helper._unpack_list(hidden) + hidden = [symbolic_helper._unsqueeze_helper(g, x, [0]) for x in hidden] + weight = ( + (w_ih, w_hh, b_ih, b_hh) if symbolic_helper._is_tensor(b_ih) else (w_ih, w_hh) + ) + has_biases = True if symbolic_helper._is_tensor(b_ih) else False + _, h_outs, c_outs = _generic_rnn( + g, + "LSTM", + input, + hidden, + weight, + has_biases, + num_layers=1, + dropout=0, + train=0, + bidirectional=False, + batch_first=False, + ) + return symbolic_helper._squeeze_helper( + g, h_outs, [0] + ), symbolic_helper._squeeze_helper(g, c_outs, [0]) + + +@_onnx_symbolic( + "aten::gru", decorate=[symbolic_helper._apply_params("GRU"), _export("gru")] +) +@_onnx_symbolic( + "aten::rnn_tanh", + decorate=[symbolic_helper._apply_params("RNN_TANH"), _export("rnn_tanh")], +) +@_onnx_symbolic( + "aten::rnn_relu", + decorate=[symbolic_helper._apply_params("RNN_RELU"), _export("rnn_relu")], +) +def _one_hidden_rnn(kind: str): + @symbolic_helper.parse_args("v", "v", "v", "i", "i", "f", "i", "i", "i") + def _rnn_full( + g, + input, + hidden, + weight_v, + has_biases, + num_layers, + dropout, + train, + bidirectional, + batch_first, + ): + 
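+ # Unpack the flat weight list and lower through the shared _generic_rnn path + # for the RNN/GRU variant captured in kind.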
weight = symbolic_helper._unpack_list(weight_v) + return _generic_rnn( + g, + kind, + input, + hidden, + weight, + has_biases, + num_layers, + dropout, + train, + bidirectional, + batch_first, + ) + + @symbolic_helper.parse_args("v", "v", "v", "v", "i", "i", "f", "i", "i") + def _rnn_packed( + g, + input, + batch_sizes, + hidden, + weight_v, + has_biases, + num_layers, + dropout, + train, + bidirectional, + ): + weight = symbolic_helper._unpack_list(weight_v) + return _generic_rnn( + g, + kind, + input, + hidden, + weight, + has_biases, + num_layers, + dropout, + train, + bidirectional, + batch_sizes=batch_sizes, + ) + + def symbolic(g, *args): + if symbolic_helper._is_tensor_list(args[3]): + return _rnn_packed(g, *args) + else: + return _rnn_full(g, *args) + + return symbolic + + +@_onnx_symbolic("aten::_dim_arange") +@symbolic_helper.parse_args("v", "i") +def _dim_arange(g: jit_utils.GraphContext, like, dim): + like_shape = g.op("Shape", like) + stop = g.op( + "Gather", like_shape, g.op("Constant", value_t=torch.tensor(dim)), axis_i=0 + ) + # aten::arange(Scalar end, ScalarType dtype, Layout, Device, bool pin_memory) + return arange(g, stop, 4, None, None, None) + + +@_onnx_symbolic("aten::detach") +def detach(g: jit_utils.GraphContext, input): + # Erase aten::detach nodes because ONNX is inference only + return input + + +@_onnx_symbolic("aten::contiguous") +@symbolic_helper.parse_args("v", "i") +def contiguous(g: jit_utils.GraphContext, input, memory_format): + if memory_format > 2: # allowed values are any, preserve and contiguous_format + raise errors.SymbolicValueError( + "onnx memory_format support is not implemented", input + ) + return input + + +@_onnx_symbolic("aten::_pack_padded_sequence") +@symbolic_helper.parse_args("v", "v", "i") +def _pack_padded_sequence(g: jit_utils.GraphContext, input, lengths, batch_first): + # Currently there is no PackPadded operator in ONNX. We rely on an + # optimization pass to remove this later. It is an error if any + # PackPadded operator cannot be optimized out. + if batch_first: + input = g.op("Transpose", input, perm_i=[1, 0, 2]) + if not lengths.type().isSubtypeOf(torch._C.TensorType.get()): + raise errors.SymbolicValueError( + "'lengths' must be a Tensor for ONNX export", input + ) + # We know it's a TensorType so this check is now safe. + # It's really only necessary because those operators expand to something that + # only works with int32 types in Caffe2... 
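+ # Cast lengths to INT32 whenever they are not already int32.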
+ if ( + _type_utils.JitScalarType.from_value( + lengths, _type_utils.JitScalarType.UNDEFINED + ) + != _type_utils.JitScalarType.INT + ): + lengths = g.op("Cast", lengths, to_i=_C_onnx.TensorProtoDataType.INT32) + return g.op("prim::PackPadded", input, lengths, outputs=2) + + +@_onnx_symbolic("aten::_pad_packed_sequence") +@symbolic_helper.parse_args("v", "v", "i", "t", "v") +def _pad_packed_sequence( + g: jit_utils.GraphContext, + data, + batch_sizes, + batch_first, + padding_value, + total_length, +): + # Ignore total_length as it is not supported in _symbolic_pad_packed_sequence + # It is only useful/used when training using data_parallel model, so + # It shouldn't be relevant for ONNX anyway + data, lengths = g.op("prim::PadPacked", data, batch_sizes, outputs=2) + if batch_first: + data = g.op("Transpose", data, perm_i=[1, 0, 2]) + return data, lengths + + +@_onnx_symbolic("aten::randint") +def randint(g: jit_utils.GraphContext, low, high, shapes, dtype, *options): + dtype = symbolic_helper._get_const(dtype, "i", "dtype") + low_i = symbolic_helper._get_const(low, "i", "low") + high_i = symbolic_helper._get_const(high, "i", "high") + if dtype is None: + scalar_type = _type_utils.JitScalarType.INT64 + else: + scalar_type = _type_utils.JitScalarType(dtype) + if low_i is None: + raise symbolic_helper._onnx_unsupported("randint", low) + if high_i is None: + raise symbolic_helper._onnx_unsupported("randint", high) + + shape = symbolic_helper._maybe_get_const(shapes, "is") + if symbolic_helper._is_value(shape): + shape_const = g.op( + "ConstantOfShape", + shapes, + value_t=torch.tensor([0], dtype=torch.float), + ) + randn = g.op( + "RandomUniformLike", + shape_const, + low_f=low_i, + high_f=high_i, + ) + else: + randn = g.op( + "RandomUniform", + shape_i=shape, + low_f=low_i, + high_f=high_i, + ) + + # cast to integer type + int_dtype = _type_utils.JitScalarType.INT64 + randint = g.op("Cast", randn, to_i=int_dtype.onnx_type()) + if int_dtype != scalar_type: + randint = g.op("Cast", randint, to_i=scalar_type.onnx_type()) + return randint + + +@_onnx_symbolic("aten::randint_like") +def randint_like(g: jit_utils.GraphContext, self, low, high, dtype, *options): + dtype = symbolic_helper._get_const(dtype, "i", "dtype") + low_i = symbolic_helper._get_const(low, "i", "low") + high_i = symbolic_helper._get_const(high, "i", "high") + if dtype is None: + scalar_type = _type_utils.JitScalarType.INT64 + else: + scalar_type = _type_utils.JitScalarType(dtype) + if low_i is None: + raise symbolic_helper._onnx_unsupported("randint", low) + if high_i is None: + raise symbolic_helper._onnx_unsupported("randint", high) + + randn = g.op( + "RandomUniformLike", + self, + low_f=low_i, + high_f=high_i, + ) + + # cast to integer type + int_dtype = _type_utils.JitScalarType.INT64 + randint = g.op("Cast", randn, to_i=int_dtype.onnx_type()) + if int_dtype != scalar_type: + randint = g.op("Cast", randint, to_i=scalar_type.onnx_type()) + return randint + + +@_onnx_symbolic("aten::randn") +def randn(g: jit_utils.GraphContext, shapes, dtype, *options): + dtype = symbolic_helper._get_const(dtype, "i", "dtype") + if dtype is None: + scalar_type = _type_utils.JitScalarType.FLOAT + else: + scalar_type = _type_utils.JitScalarType(dtype) + shape = symbolic_helper._maybe_get_const(shapes, "is") + if symbolic_helper._is_value(shape): + shape_const = g.op( + "ConstantOfShape", + shapes, + value_t=torch.tensor([0], dtype=torch.float), + ) + return g.op( + "RandomNormalLike", + shape_const, + dtype_i=scalar_type.onnx_type(), + ) + 
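+ # Static case: the shape is a compile-time constant, so emit RandomNormal directly.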
return g.op( + "RandomNormal", + shape_i=shape, + dtype_i=scalar_type.onnx_type(), + ) + + +@_onnx_symbolic("aten::rand") +def rand(g: jit_utils.GraphContext, shapes, dtype, *options): + dtype = symbolic_helper._get_const(dtype, "i", "dtype") + if dtype is None: + scalar_type = _type_utils.JitScalarType.FLOAT + else: + scalar_type = _type_utils.JitScalarType(dtype) + shape = symbolic_helper._maybe_get_const(shapes, "is") + if symbolic_helper._is_value(shape): + shape_const = g.op( + "ConstantOfShape", + shapes, + value_t=torch.tensor([0], dtype=torch.float), + ) + return g.op( + "RandomUniformLike", + shape_const, + dtype_i=scalar_type.onnx_type(), + ) + return g.op( + "RandomUniform", + shape_i=shape, + dtype_i=scalar_type.onnx_type(), + ) + + +@_onnx_symbolic("aten::randn_like") +def randn_like( + g: jit_utils.GraphContext, + self, + dtype, + layout=None, + device=None, + pin_memory=False, + memory_format=None, +): + dtype = symbolic_helper._get_const(dtype, "i", "dtype") + if dtype is None: + scalar_type = _type_utils.JitScalarType.from_value( + self, _type_utils.JitScalarType.FLOAT + ) + else: + scalar_type = _type_utils.JitScalarType(dtype) + return g.op("RandomNormalLike", self, dtype_i=scalar_type.onnx_type()) + + +@_onnx_symbolic("aten::rand_like") +def rand_like( + g: jit_utils.GraphContext, + self, + dtype, + layout=None, + device=None, + pin_memory=False, + memory_format=None, +): + dtype = symbolic_helper._get_const(dtype, "i", "dtype") + if dtype is None: + dtype = _type_utils.JitScalarType.from_value( + self, _type_utils.JitScalarType.FLOAT + ) + return g.op( + "RandomUniformLike", self, dtype_i=_type_utils.JitScalarType(dtype).onnx_type() + ) + + +@_onnx_symbolic("aten::rrelu") +@symbolic_helper.parse_args("v", "f", "f", "i", "none") +def rrelu(g: jit_utils.GraphContext, input, lower, upper, training, generator): + if not training: + slope = (upper + lower) / 2.0 + return g.op("LeakyRelu", input, alpha_f=slope) + p = g.op("RandomUniformLike", input, high_f=upper, low_f=lower) + return g.op("PRelu", input, p) + + +@_onnx_symbolic("aten::bernoulli") +def bernoulli(g: jit_utils.GraphContext, input, p=None, generator=None, out=None): + if out is not None and not symbolic_helper._is_none(out): + symbolic_helper._unimplemented( + "Bernoulli", "out parameter is not supported for bernoulli", input + ) + if generator is not None and not symbolic_helper._is_none(generator): + symbolic_helper._unimplemented( + "Bernoulli", "generator is not supported for bernoulli", input + ) + + dtype = _type_utils.JitScalarType.from_value( + input, _type_utils.JitScalarType.UNDEFINED + ) + if dtype == _type_utils.JitScalarType.UNDEFINED: + return symbolic_helper._unimplemented( + "Bernoulli", "input dtype not accessible", input + ) + + rands = g.op( + "RandomUniformLike", + input, + high_f=1.0, + low_f=0.0, + dtype_i=dtype.onnx_type(), + ) + prob = p if p is not None and not symbolic_helper._is_none(p) else input + output = g.op("Less", rands, prob) + return g.op("Cast", output, to_i=dtype.onnx_type()) + + +@_onnx_symbolic("aten::log_sigmoid") +@symbolic_helper.parse_args("v") +def log_sigmoid(g: jit_utils.GraphContext, input): + p = g.op("Sigmoid", input) + return g.op("Log", p) + + +@_onnx_symbolic("aten::erf") +@symbolic_helper.parse_args("v") +def erf(g: jit_utils.GraphContext, input): + return g.op("Erf", input) + + +@_onnx_symbolic("aten::flatten") +@symbolic_helper.quantized_args(True, False, False) +@symbolic_helper.parse_args("v", "i", "i") +def flatten(g: jit_utils.GraphContext, input, 
start_dim, end_dim): + dim = symbolic_helper._get_tensor_rank(input) + if dim is None: + return symbolic_helper._unimplemented( + "dim", + "ONNX and PyTorch use different strategies to split the input. " + "Input rank must be known at export time.", + input, + ) + + if dim == 0: + return symbolic_helper._reshape_helper(g, input, [1]) + if dim == 1: + return g.op("Identity", input) + # TODO: remove this as onnx opset 11 spec allows negative axes + if end_dim < 0: + end_dim = dim + end_dim + # use ONNX's Flatten operator for cases where the output shape is 2D + if start_dim == 1 and end_dim == dim - 1: + return g.op("Flatten", input, axis_i=start_dim) + if start_dim == 0 and end_dim == dim - 2: + return g.op("Flatten", input, axis_i=end_dim + 1) + + return symbolic_helper._flatten_helper(g, input, start_dim, end_dim, dim) + + +@_onnx_symbolic("aten::nonzero") +@symbolic_helper.parse_args("v") +def nonzero(g: jit_utils.GraphContext, input): + """Emitted from `torch.nonzero(x, as_tuple=False)`""" + return t(g, g.op("NonZero", input)) + + +@_onnx_symbolic("aten::nonzero_numpy") +# Emitted from `torch.nonzero(x, as_tuple=True)` +def nonzero_numpy(g: jit_utils.GraphContext, input, _outputs=None): + return unbind(g, nonzero(g, input), 1, _outputs=_outputs) + + +@_onnx_symbolic("aten::isnan") +@symbolic_helper.parse_args("v") +def isnan(g: jit_utils.GraphContext, input): + output = g.op("IsNaN", input) + return output + + +@_onnx_symbolic("aten::any") +def _any(g: jit_utils.GraphContext, *args): + # aten::any(Tensor self) + if len(args) == 1: + input = args[0] + dim, keepdim = None, 0 + # aten::any(Tensor self, int[]? dim, bool keepdim) + else: + input, dim, keepdim = args + # Can be int list or single int + dim = symbolic_helper._parse_arg(dim, "t") + dim = [int(d) for d in dim.view(-1)] + keepdim = symbolic_helper._parse_arg(keepdim, "i") + input = g.op("Cast", input, to_i=_C_onnx.TensorProtoDataType.INT64) + input_sum = symbolic_helper._reducesum_helper( + g, input, axes_i=dim, keepdims_i=keepdim + ) + return gt(g, input_sum, g.op("Constant", value_t=torch.tensor(0, dtype=torch.long))) + + +@_onnx_symbolic("aten::all") +def _all(g: jit_utils.GraphContext, *args): + input = g.op("Not", args[0]) + # aten::all(Tensor self) + if len(args) == 1: + return g.op("Not", _any(g, input)) + # aten::all(Tensor self, int[]? 
dim, bool keepdim) + else: + return g.op("Not", _any(g, input, args[1], args[2])) + + +@_onnx_symbolic("aten::narrow") +@symbolic_helper.parse_args("v", "i", "i", "i") +def narrow(g: jit_utils.GraphContext, input, dim, start, length): + return symbolic_helper._slice_helper( + g, input, axes=[dim], starts=[start], ends=[start + length] + ) + + +@_onnx_symbolic("aten::argmax") +@symbolic_helper.parse_args("v", "v", "b") +def argmax( + g: jit_utils.GraphContext, + input: torch._C.Value, + dim: torch._C.Value, + keepdim: bool, +): + return symbolic_helper._argmin_argmax_helper(g, input, dim, keepdim, "ArgMax") + + +@_onnx_symbolic("aten::argmin") +@symbolic_helper.parse_args("v", "v", "b") +def argmin( + g: jit_utils.GraphContext, + input: torch._C.Value, + dim: torch._C.Value, + keepdim: bool, +): + return symbolic_helper._argmin_argmax_helper(g, input, dim, keepdim, "ArgMin") + + +@_onnx_symbolic("aten::scatter") +@symbolic_helper.parse_args("v", "i", "v", "v") +def scatter(g: jit_utils.GraphContext, self, dim, index, src): + src_type = _type_utils.JitScalarType.from_value( + src, _type_utils.JitScalarType.UNDEFINED + ) + src = symbolic_helper._maybe_get_scalar(src) + if symbolic_helper._is_value(src): + return g.op("Scatter", self, index, src, axis_i=dim) + else: + # Check if scalar "src" has same type as self (PyTorch allows different + # type for scalar src (but not when src is tensor)). If not, insert Cast node. + self_scalar_type = _type_utils.JitScalarType.from_value(self) + if self_scalar_type != src_type: + src = g.op("Cast", src, to_i=self_scalar_type.onnx_type()) + return g.op("Scatter", self, index, expand_as(g, src, index), axis_i=dim) + + +@_onnx_symbolic("aten::scatter_add") +@symbolic_helper.parse_args("v", "i", "v", "v") +def scatter_add(g: jit_utils.GraphContext, self, dim, index, src): + scalar_type = symbolic_helper._try_get_scalar_type(self) + if scalar_type is None: + return symbolic_helper._unimplemented( + "scatter_add", "input dtype not accessible", self + ) + sizes = symbolic_helper._get_tensor_sizes(self, allow_nonstatic=False) + if sizes: + to_add = g.op("Constant", value_t=torch.zeros(sizes, dtype=scalar_type.dtype())) + else: + to_add = zeros_like(g, self, scalar_type) + to_add = symbolic_helper._scatter_helper(g, to_add, dim, index, src) + return add(g, self, to_add) + + +@_onnx_symbolic("aten::log2") +def log2(g: jit_utils.GraphContext, self): + _ln2 = 0.693147180559945309 + return g.op("Div", log(g, self), g.op("Constant", value_t=torch.tensor(_ln2))) + + +@_onnx_symbolic("aten::is_floating_point") +def is_floating_point(g: jit_utils.GraphContext, self): + if symbolic_helper._is_fp(self): + return g.op("Constant", value_t=torch.BoolTensor([1])) + return g.op("Constant", value_t=torch.BoolTensor([0])) + + +@_onnx_symbolic("aten::__is_") +def __is_(g: jit_utils.GraphContext, self, other): + if symbolic_helper._is_none(other): + if symbolic_helper._is_none(self): + return g.op("Constant", value_t=torch.BoolTensor([1])) + return g.op("Constant", value_t=torch.BoolTensor([0])) + return eq(g, self, other) + + +@_onnx_symbolic("aten::__isnot_") +@wrap_logical_op_with_negation +def __isnot_(g: jit_utils.GraphContext, self, other): + return __is_(g, self, other) + + +@_onnx_symbolic("aten::one_hot") +def one_hot(g: jit_utils.GraphContext, self, num_classes): + values = g.op("Constant", value_t=torch.LongTensor([0, 1])) + # onnxruntime supports limited type combinations for OneHot. 
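+ # Promote uint8/int8/int16/int32 depths to INT64 so the OneHot inputs type-check.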
+ if _type_utils.JitScalarType.from_value( + num_classes, _type_utils.JitScalarType.UNDEFINED + ) in { + _type_utils.JitScalarType.UINT8, + _type_utils.JitScalarType.INT8, + _type_utils.JitScalarType.INT, + _type_utils.JitScalarType.INT16, + }: + num_classes = g.op("Cast", num_classes, to_i=_C_onnx.TensorProtoDataType.INT64) + return g.op("OneHot", self, num_classes, values, axis_i=-1) + + +@_onnx_symbolic("aten::gather") +@symbolic_helper.parse_args("v", "i", "v", "v") +def gather(g: jit_utils.GraphContext, self, dim, index, sparse_grad=False): + if symbolic_helper._maybe_get_const(sparse_grad, "i"): + return symbolic_helper._unimplemented("gather", "sparse_grad == True", self) + # NOTE: This workaround is needed since GatherElement is only supported + # since opset 11, and Gather in ONNX is not the same as torch.gather. + scalar_type = _type_utils.JitScalarType.from_value(self) + values = g.op("Constant", value_t=torch.LongTensor([0, 1])) + depth = size(g, self, g.op("Constant", value_t=torch.LongTensor([dim]))) + index = g.op( + "Cast", + g.op("OneHot", index, depth, values, axis_i=dim), + to_i=scalar_type.onnx_type(), + ) + mul = g.op("Mul", symbolic_helper._unsqueeze_helper(g, self, [dim + 1]), index) + return symbolic_helper._reducesum_helper(g, mul, axes_i=[dim], keepdims_i=0) + + +@symbolic_helper.parse_args("v", "is", "i", "i") +def _var_mean(g: jit_utils.GraphContext, input, dim, correction, keepdim): + return symbolic_helper._var_mean_helper(g, input, dim, correction, keepdim) + + +@_onnx_symbolic("aten::std") +def std(g: jit_utils.GraphContext, input, *args): + var, _ = var_mean(g, input, *args) + return g.op("Sqrt", var) + + +@_onnx_symbolic("aten::var") +def var(g: jit_utils.GraphContext, input, *args): + var, _ = var_mean(g, input, *args) + return var + + +@_onnx_symbolic("aten::var_mean") +def var_mean(g: jit_utils.GraphContext, input, *args): + if len(args) == 1: + return _var_mean(g, input, None, args[0], None) + else: + return _var_mean(g, input, *args) + + +@_onnx_symbolic("aten::std_mean") +def std_mean(g: jit_utils.GraphContext, input, *args): + var, mean = var_mean(g, input, *args) + return g.op("Sqrt", var), mean + + +@_onnx_symbolic("aten::logsumexp") +@symbolic_helper.parse_args("v", "is", "i") +def logsumexp(g: jit_utils.GraphContext, input, dim, keepdim): + return g.op("ReduceLogSumExp", input, axes_i=dim, keepdims_i=keepdim) + + +@_onnx_symbolic("aten::arange") +def arange(g: jit_utils.GraphContext, *args): + def _get_arange_dtype(dtype): + dtype = symbolic_helper._maybe_get_const(dtype, "i") + return dtype + + def _float_step_convert(range_tensor): + if symbolic_helper._is_fp(range_tensor): + range_tensor = g.op( + "Cast", + g.op("Ceil", range_tensor), + to_i=_type_utils.JitScalarType.INT64.onnx_type(), + ) + return range_tensor + + if len(args) == 2 or len(args) == 5: + if len(args) == 2: + # aten::arange(Scalar end, Tensor out) + dtype = None + else: + # aten::arange(Scalar end, ScalarType dtype, Layout, Device, bool pin_memory) + dtype = _get_arange_dtype(args[1]) + dtype, end, start, step = symbolic_helper._arange_cast_helper( + g, end=args[0], dtype=dtype + ) + end = symbolic_helper._unsqueeze_helper(g, end, [0]) + range_tensor = _float_step_convert(end) + arange_tensor = symbolic_helper._squeeze_helper( + g, nonzero(g, ones(g, range_tensor, dtype, None, None)), [1] + ) + return g.op( + "Cast", arange_tensor, to_i=_type_utils.JitScalarType(dtype).onnx_type() + ) + elif len(args) == 4 or len(args) == 7: + if len(args) == 4: + # aten::arange(Scalar start, 
Scalar end, Scalar step, Tensor out) + dtype = None + else: + # aten::arange(Scalar start, Scalar end, Scalar step, ScalarType dtype, Layout, Device, bool pin_memory) + dtype = _get_arange_dtype(args[3]) + dtype, end, start, step = symbolic_helper._arange_cast_helper( + g, start=args[0], end=args[1], step=args[2], dtype=dtype + ) + step = symbolic_helper._unsqueeze_helper(g, step, [0]) + end = symbolic_helper._unsqueeze_helper(g, end, [0]) + start = symbolic_helper._unsqueeze_helper(g, start, [0]) + range_tensor = _float_step_convert(g.op("Div", g.op("Sub", end, start), step)) + arange_tensor = symbolic_helper._squeeze_helper( + g, nonzero(g, ones(g, range_tensor, None, None, None)), [1] + ) + arange_tensor = g.op("Add", g.op("Mul", arange_tensor, step), start) + return g.op( + "Cast", arange_tensor, to_i=_type_utils.JitScalarType(dtype).onnx_type() + ) + elif len(args) == 6: + # aten::arange(Scalar start, Scalar end, ScalarType dtype, Layout, Device, bool pin_memory) + dtype = _get_arange_dtype(args[2]) + dtype, end, start, step = symbolic_helper._arange_cast_helper( + g, start=args[0], end=args[1], dtype=dtype + ) + end = symbolic_helper._unsqueeze_helper(g, end, [0]) + start = symbolic_helper._unsqueeze_helper(g, start, [0]) + range_tensor = _float_step_convert(g.op("Sub", end, start)) + arange_tensor = g.op( + "Add", + symbolic_helper._squeeze_helper( + g, nonzero(g, ones(g, range_tensor, dtype, *(args[3:]))), [1] + ), + start, + ) + return g.op( + "Cast", arange_tensor, to_i=_type_utils.JitScalarType(dtype).onnx_type() + ) + + return symbolic_helper._unimplemented("aten::arange", f"with {len(args)} arguments") + + +@_onnx_symbolic("aten::linspace") +def linspace( + g: jit_utils.GraphContext, start, end, steps, dtype, layout, device, pin_memory +): + range_tensor = symbolic_helper._arange_helper(g, steps, None) + step = div( + g, + sub(g, end, start), + sub(g, steps, g.op("Constant", value_t=torch.tensor(1, dtype=torch.int64))), + ) + return add(g, mul(g, range_tensor, step), start) + + +@_onnx_symbolic("aten::lift") +def lift(g: jit_utils.GraphContext, self): + # at::lift() is a no-op from the perspective of tracing for onnx + return self + + +@_onnx_symbolic("aten::masked_fill") +def masked_fill(g: jit_utils.GraphContext, self, mask, value): + """Implement the masked_fill functionality available for a pytorch tensor in ONNX. + + Fills elements of the input tensor with `value` where `mask` is True. + """ + mask = g.op("Cast", mask, to_i=_C_onnx.TensorProtoDataType.BOOL) + value = symbolic_helper._maybe_get_scalar(value) + return g.op("Where", mask, symbolic_helper._if_scalar_type_as(value, self), self) + + +@_onnx_symbolic("aten::masked_fill_") +def masked_fill_(g: jit_utils.GraphContext, self, mask, value): + return masked_fill(g, self, mask, value) + + +@_onnx_symbolic("aten::index") +def index(g: jit_utils.GraphContext, self, index): + if symbolic_helper._is_packed_list(index): + indices = symbolic_helper._unpack_list(index) + else: + indices = [index] + + def try_mask_to_index(index): + if not symbolic_helper._is_none(index) and ( + _type_utils.JitScalarType.from_value( + index, _type_utils.JitScalarType.UNDEFINED + ) + == _type_utils.JitScalarType.UINT8 + or symbolic_helper._is_bool(index) + ): + if g.opset < 9: + raise errors.SymbolicValueError( + "Exporting masked indices are only supported after ONNX opset 9.", + self, + ) + warnings.warn( + "Exporting aten::index operator with indices of type Byte. " + "Only 1-D indices are supported. 
In any other case, " + "this will produce an incorrect ONNX graph." + ) + index = symbolic_helper._squeeze_helper(g, nonzero(g, index), [1]) + return index + + indices = [try_mask_to_index(idx) for idx in indices] + if len(indices) == 1: + return symbolic_helper._select_helper( + g, self, 0, indices[0], apply_reshape=False + ) + else: + # Multiple tensors as indices. Each tensor could either be + # 1. prim::Constant() + # representing ":" in python indexing. E.g. tensor[:, :] + # 2. prim::Constant[value=...] or tensor output + # representing advanced indexing. E.g. tensor[[0, 1], [2, 0]]. + # For more info on advanced indexing, + # check https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#advanced-indexing + + # Consider a general case of + # t: [x_1, y_1, y_2, ..., x_m, ..., y_n] + # where t is a tensor of rank m+n, {x_i} are axes where tensor index is provided, and {y_i} are axes for ":". + # Same results can be achieved through transposing t into + # t: [x_1, x_2, ..., x_m, y_1, y_2, ..., y_n] + # and use gatherND. However ONNX does not have gatherND, to use 1d gather we'll need to flatten t + # and process the tensor indices. + # t: [x_1 * x_2 * ... * x_m, y_1 * y_2 * ... * y_n] + # tensor index = \sum_{i=1}^m (ind_i * \prod_{j=i+1}^m (x_j)) + # After gather, reshape and transpose back. + adv_idx_indices = [ + i for i, idx in enumerate(indices) if not symbolic_helper._is_none(idx) + ] + + if len(adv_idx_indices) == 0: + return self + elif len(adv_idx_indices) == 1: + return index_select( + g, self, adv_idx_indices[0], indices[adv_idx_indices[0]] + ) + else: + rank = symbolic_helper._get_tensor_rank(self) + if rank is None: + return symbolic_helper._unimplemented( + "aten::index", + "operator of advanced indexing on tensor of unknown rank. ", + self, + ) + # TODO: If indexing is supported natively in ONNX in future opsets, + # update the warning to recommend exporting with higher opset version. + warnings.warn( + "Exporting aten::index operator of advanced indexing in opset " + f"{GLOBALS.export_onnx_opset_version}" + " is achieved by combination of multiple ONNX operators, " + "including Reshape, Transpose, Concat, and Gather. " + "If indices include negative values, the exported graph will produce incorrect results." + ) + adv_idx_count = len(adv_idx_indices) + shape_tensor = _shape_as_tensor(g, self) + dim_tensor_list = [ + g.op( + "Gather", + shape_tensor, + g.op("Constant", value_t=torch.LongTensor([dim])), + axis_i=0, + ) + for dim in range(rank) + ] + + self = g.op( + "Transpose", + self, + perm_i=adv_idx_indices + + [i for i in range(rank) if i not in adv_idx_indices], + ) + self = g.op("Flatten", self, axis_i=adv_idx_count) + + # Note that tensor indices will be broadcasted while accumulating. Thus we get the final subarray shape as well. + cum_adv_index = indices[adv_idx_indices[-1]] + multiplier = dim_tensor_list[adv_idx_indices[-1]] + for i in range(adv_idx_count - 2, -1, -1): + adv_index = g.op("Mul", indices[adv_idx_indices[i]], multiplier) + cum_adv_index = g.op("Add", cum_adv_index, adv_index) + multiplier = g.op( + "Mul", multiplier, dim_tensor_list[adv_idx_indices[i]] + ) + + # perform gather + self = index_select(g, self, 0, cum_adv_index) + + cum_adv_index_shape_tensor = _shape_as_tensor(g, cum_adv_index) + # check if all advanced indices are consecutive. + # Refer to https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#combining-advanced-and-basic-indexing + # to understand how the subarray position is decided. 
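+ # Consecutive advanced indices keep the gathered block at its original position; + # otherwise, per numpy semantics, it is moved to the front of the result.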
+ if adv_idx_indices == list( + range(adv_idx_indices[0], adv_idx_indices[-1] + 1) + ): + # unfold regular index axes + folded_adv_idx_shape_list = [ + g.op("Constant", value_t=torch.LongTensor([-1])) + ] + [ + dim_tensor_list[i] for i in range(rank) if i not in adv_idx_indices + ] + folded_adv_idx_shape = g.op( + "Concat", *folded_adv_idx_shape_list, axis_i=0 + ) + self = symbolic_helper._reshape_helper(g, self, folded_adv_idx_shape) + + # Transpose folded advanced indexed axis to its original location. + adv_idx_permute = ( + list(range(1, adv_idx_indices[0] + 1)) + + [0] + + list(range(adv_idx_indices[0] + 1, rank - adv_idx_count + 1)) + ) + self = g.op("Transpose", self, perm_i=adv_idx_permute) + + # unfold advanced index axes + final_shape_list = ( + [dim_tensor_list[i] for i in range(adv_idx_indices[0])] + + [cum_adv_index_shape_tensor] + + [ + dim_tensor_list[i] + for i in range(adv_idx_indices[0], rank) + if i not in adv_idx_indices + ] + ) + final_shape = g.op("Concat", *final_shape_list, axis_i=0) + else: + final_shape = g.op( + "Concat", + cum_adv_index_shape_tensor, + *[ + dim_tensor_list[i] + for i in range(rank) + if i not in adv_idx_indices + ], + axis_i=0, + ) + + return symbolic_helper._reshape_helper(g, self, final_shape) + + +@_onnx_symbolic("aten::linalg_norm") +@symbolic_helper.parse_args("v", "v", "is", "b", "v") +def linalg_norm( + g: jit_utils.GraphContext, + self: torch._C.Value, + ord: torch._C.Value, + dim: Sequence[int] | None, + keepdim: bool, + dtype: torch._C.Value, +): + # Conditions based on https://pytorch.org/docs/stable/generated/torch.linalg.norm.html + ord_value = None + if dim is None: + if symbolic_helper._is_none(ord): + self = symbolic_helper._reshape_helper(g, self, [-1]) + ord = g.op("Constant", value_t=torch.LongTensor([2])) + self_dim = symbolic_helper._get_tensor_rank(self) + if self_dim is None: + return symbolic_helper._unimplemented( + "dim", "Input rank must be known at export time.", self + ) + if self_dim == 1: + ord_value = symbolic_helper._parse_arg(ord, "f") + else: + dim = [0, 1] + else: + if len(dim) == 1: + if symbolic_helper._is_none(ord): + ord = g.op("Constant", value_t=torch.LongTensor([2])) + ord_value = symbolic_helper._parse_arg(ord, "f") + if ord_value: + return linalg_vector_norm(g, self, ord_value, dim, keepdim, dtype) + return linalg_matrix_norm(g, self, ord, dim, keepdim, dtype) + + +@_onnx_symbolic("aten::linalg_vector_norm") +@symbolic_helper.parse_args("v", "f", "is", "b", "v") +def linalg_vector_norm( + g: jit_utils.GraphContext, + self: torch._C.Value, + ord: float, + dim: Sequence[int] | None, + keepdim: bool, + dtype: torch._C.Value, +): + return symbolic_helper._linalg_vector_norm_helper(g, self, ord, dim, keepdim, dtype) + + +@_onnx_symbolic("aten::linalg_matrix_norm") +@symbolic_helper.parse_args("v", "v", "is", "b", "v") +def linalg_matrix_norm( + g: jit_utils.GraphContext, + self: torch._C.Value, + ord: torch._C.Value, + dim: list[int], + keepdim: bool, + dtype: torch._C.Value, +): + # Conditions based on https://pytorch.org/docs/stable/generated/torch.linalg.matrix_norm.html + ord_value = symbolic_helper._parse_arg(ord, "s") + if ord_value == "fro": + return frobenius_norm(g, self, dim, keepdim) + elif ord_value == "nuc": + return symbolic_helper._unimplemented("linalg.matrix_norm", "ord==nuc", self) + else: + ord_value = symbolic_helper._parse_arg(ord, "f") + if ord_value is None: + return frobenius_norm(g, self, dim, keepdim) + if ord_value == 2 or ord_value == -2: + # ord = 2/-2 unimplemented due to lack 
of operators + # used to calculate singular values + return symbolic_helper._unimplemented("linalg.matrix_norm", "ord==2", self) + # Wrap the dim vector to handle negative dim values + self_dim = symbolic_helper._get_tensor_rank(self) + if self_dim is None: + return symbolic_helper._unimplemented( + "linalg.matrix_norm", "Input rank must be known at export time.", self + ) + # Common implementation for cases with + # ord = 1/-1 and ord = inf/-inf + if dim[0] < 0: + dim[0] += self_dim + if dim[1] < 0: + dim[1] += self_dim + + if ord_value == math.inf or ord_value == -math.inf: + dim[0], dim[1] = dim[1], dim[0] + if dim[1] > dim[0] and not keepdim: + dim[1] -= 1 + sum = symbolic_helper._reducesum_helper( + g, g.op("Abs", self), axes_i=[dim[0]], keepdims_i=keepdim + ) + if ord_value > 0: + result, indices = max( + g, + sum, + dim_or_y=g.op("Constant", value_t=torch.LongTensor([dim[1]])), + keepdim=keepdim, + ) + else: + result, indices = min( + g, + sum, + dim_or_y=g.op("Constant", value_t=torch.LongTensor([dim[1]])), + keepdim=keepdim, + ) + return result + + +@_onnx_symbolic("aten::linalg_cross") +@symbolic_helper.parse_args("v", "v", "i") +def linalg_cross(g: jit_utils.GraphContext, input, other, dim=-1): + return cross(g, input, other, dim) + + +@_onnx_symbolic("aten::frobenius_norm") +@symbolic_helper.parse_args("v", "is", "b") +def frobenius_norm(g: jit_utils.GraphContext, self, dim=None, keepdim=False): + sqr = g.op("Mul", self, self) + sumsqr = symbolic_helper._reducesum_helper(g, sqr, axes_i=dim, keepdims_i=keepdim) + return g.op("Sqrt", sumsqr) + + +@_onnx_symbolic("aten::multinomial") +@symbolic_helper.parse_args("v", "i", "b", "v") +def multinomial( + g: jit_utils.GraphContext, input, num_samples, replacement=False, generator=None +): + if generator is not None and not symbolic_helper._is_none(generator): + symbolic_helper._unimplemented( + "Multinomial", "generator is not supported for multinomial", input + ) + if not replacement and num_samples > 1: + symbolic_helper._unimplemented( + "Multinomial", + "replacement=False when num_samples > 1 is not supported for multinomial", + input, + ) + + log_input = log(g, input) + return g.op( + "Multinomial", + log_input, + dtype_i=_C_onnx.TensorProtoDataType.INT64, + sample_size_i=num_samples, + ) + + +@_onnx_symbolic("aten::baddbmm") +def baddbmm(g: jit_utils.GraphContext, self, batch1, batch2, beta, alpha): + scalar_type = _type_utils.JitScalarType.from_value(self) + batch_mul = matmul(g, batch1, batch2) + mul_a = mul( + g, + batch_mul, + g.op("Cast", alpha, to_i=scalar_type.onnx_type()), + ) + mul_b = mul( + g, + self, + g.op("Cast", beta, to_i=scalar_type.onnx_type()), + ) + return add(g, mul_a, mul_b) + + +@_onnx_symbolic("aten::meshgrid") +@symbolic_helper.parse_args("v", "s") +def meshgrid(g: jit_utils.GraphContext, tensor_list, indexing: str | None = None): + if indexing is None: + indexing = "ij" + elif indexing not in {"ij", "xy"}: + raise errors.SymbolicValueError( + f"Unsupported indexing: {indexing}", tensor_list + ) + unpacked_tensor_list = symbolic_helper._unpack_list(tensor_list) + if indexing == "xy": + unpacked_tensor_list[:2] = unpacked_tensor_list[1::-1] + tensors = [ + symbolic_helper._reshape_helper( + g, t, g.op("Constant", value_t=torch.LongTensor([-1])) + ) + for t in unpacked_tensor_list + ] + tensors_shape = [g.op("Shape", t) for t in tensors] + out_shape = g.op("Concat", *tensors_shape, axis_i=0) + out = [] + for i, t in enumerate(tensors): + shape_i = [g.op("Constant", value_t=torch.ones(1, 
dtype=torch.int64))] * len( + tensors + ) + shape_i[i] = tensors_shape[i] + t_reshaped = _reshape_from_tensor(g, t, g.op("Concat", *shape_i, axis_i=0)) + out.append(g.op("Expand", t_reshaped, out_shape)) + if indexing == "xy": + out[0], out[1] = out[1], out[0] + return g.op("prim::ListConstruct", *out) + + +@_onnx_symbolic("aten::remainder") +def remainder(g: jit_utils.GraphContext, input, other): + div = _floor_divide(g, input, other) + quo = g.op("Mul", div, other) + return g.op("Sub", input, quo) + + +@_onnx_symbolic("aten::gelu") +@symbolic_helper.parse_args("v", "s") +def gelu(g: jit_utils.GraphContext, self: torch._C.Value, approximate: str = "none"): + if approximate == "tanh": + kBeta = math.sqrt(2 / math.pi) + kKappa = 0.044715 + + beta = torch.tensor(kBeta, dtype=torch.double) + kappa = torch.tensor(kKappa, dtype=torch.double) + one = torch.tensor(1.0, dtype=torch.double) + half = torch.tensor(0.5, dtype=torch.double) + + self_cube = mul(g, self, mul(g, self, self)) + inner = mul(g, beta, add(g, self, mul(g, kappa, self_cube))) + return mul(g, half, mul(g, self, add(g, one, g.op("Tanh", inner)))) + else: + _sqrt2 = 1.4142135623730951 + erf = g.op("Erf", g.op("Div", self, torch.tensor(_sqrt2, dtype=torch.double))) + erf_plusone = add( + g, erf, g.op("Constant", value_t=torch.tensor(1, dtype=torch.double)) + ) + return mul( + g, + mul(g, self, erf_plusone), + g.op("Constant", value_t=torch.tensor(0.5, dtype=torch.double)), + ) + + +@_onnx_symbolic("aten::group_norm") +@symbolic_helper.quantized_args(True, False, False, False) +@symbolic_helper.parse_args("v", "i", "v", "v", "f", "i") +def group_norm( + g: jit_utils.GraphContext, input, num_groups, weight, bias, eps, cudnn_enabled +): + channel_size = symbolic_helper._get_tensor_dim_size(input, 1) + if channel_size is not None: + assert channel_size % num_groups == 0 + input_rank = symbolic_helper._get_tensor_rank(input) + if input_rank is None: + return symbolic_helper._unimplemented("group_norm", "unknown input rank", input) + # 0 in the shape list keeps dimension value unchanged. + shape = [0, num_groups, -1] + input_reshaped = symbolic_helper._reshape_helper( + g, input, g.op("Constant", value_t=torch.LongTensor(shape)) + ) + + # C is always divisible by num_groups + # Due to shape difference. 
we need to apply weight and bias after + # instance norm computation and reshape + weight_ = g.op( + "Constant", + value_t=torch.tensor( + [1.0] * num_groups, + dtype=_type_utils.JitScalarType.from_value(input).dtype(), + ), + ) + bias_ = g.op( + "Constant", + value_t=torch.tensor( + [0.0] * num_groups, + dtype=_type_utils.JitScalarType.from_value(input).dtype(), + ), + ) + + norm_reshaped = g.op( + "InstanceNormalization", input_reshaped, weight_, bias_, epsilon_f=eps + ) + norm = symbolic_helper._reshape_helper(g, norm_reshaped, g.op("Shape", input)) + + if weight is None or weight.node().mustBeNone(): + weight_value = torch.tensor( + [1.0], dtype=_type_utils.JitScalarType.from_value(input).dtype() + ) + weight = g.op("Constant", value_t=weight_value) + if bias is None or bias.node().mustBeNone(): + bias_value = torch.tensor( + [0.0], dtype=_type_utils.JitScalarType.from_value(input).dtype() + ) + bias = g.op("Constant", value_t=bias_value) + + # Norm has shape [N, C, *] so we reshape weight and bias to [C, *] + axes = list(range(1, input_rank - 1)) + return add( + g, + mul(g, norm, symbolic_helper._unsqueeze_helper(g, weight, axes)), + symbolic_helper._unsqueeze_helper(g, bias, axes), + ) + + +@_onnx_symbolic("aten::_weight_norm") +@symbolic_helper.parse_args("v", "v", "i") +def _weight_norm(g: jit_utils.GraphContext, weight_v, weight_g, dim): + rank = symbolic_helper._get_tensor_rank(weight_v) + if rank is not None: + # W = g * ((v) / ||v||) + # Compute norm_except_dim for l2 norm. dim = None means over all dims + # torch's weight_norm module sets dim = -1 if it's None. + # This conflicts the logic for negative axes to access dims backwards + # TODO: Might need a fix in torch group_norm module + axes = list(range(rank)) + if dim is not None: + if dim < -1: + dim += rank + if dim != -1: + axes.remove(dim) + norm_v = norm(g, weight_v, 2, axes, 1) + div = g.op("Div", weight_v, norm_v) + return g.op("Mul", div, weight_g) + raise errors.SymbolicValueError( + "Unsupported: ONNX export of _weight_norm for tensor of unknown rank.", + weight_v, + ) + + +@_onnx_symbolic("aten::dim") +def dim(g: jit_utils.GraphContext, self): + """Implement the dim functionality available for a pytorch tensor in ONNX""" + # ONNX does not support dim directly in this opset so we can use 2 ops to get the info + shape = g.op("Shape", self) + return g.op("Size", shape) + + +@_onnx_symbolic("aten::__contains_") +def __contains_(g: jit_utils.GraphContext, self, element): + unpacked_list = symbolic_helper._unpack_list(self) + if all( + symbolic_helper._is_constant(x) for x in unpacked_list + ) and symbolic_helper._is_constant(element): + return g.op( + "Constant", + value_t=torch.tensor( + symbolic_helper._node_get(element.node(), "value") + in (symbolic_helper._node_get(x.node(), "value") for x in unpacked_list) + ), + ) + + raise errors.SymbolicValueError( + "Unsupported: ONNX export of __contains__ for non-constant list or element.", + self, + ) + + +@_onnx_symbolic("aten::__getitem_") +def __getitem_(g: jit_utils.GraphContext, self, i): + return select(g, self, g.op("Constant", value_t=torch.tensor([0])), i) + + +@_onnx_symbolic("aten::item") +def item(g: jit_utils.GraphContext, self): + return self + + +@_onnx_symbolic("aten::take") +def take(g: jit_utils.GraphContext, self, index): + self_flattened = symbolic_helper._reshape_helper( + g, self, g.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64)) + ) + out = index_select(g, self_flattened, 0, index) + out = reshape_as(g, out, index) + return out + + +def 
_kl_div_log_target_impl(g: jit_utils.GraphContext, input, target): + diff_ = sub(g, target, input) + exp_ = exp(g, target) + output = mul(g, exp_, diff_) + return output + + +def _kl_div_non_log_target_impl(g: jit_utils.GraphContext, input, target): + log_ = log(g, target) + diff_ = sub(g, log_, input) + output_pos = mul(g, target, diff_) + zeros_ = zeros_like(g, output_pos) + mask_ = gt(g, target, g.op("Constant", value_t=torch.tensor(0))) + output = where(g, mask_, output_pos, zeros_) + return output + + +@_onnx_symbolic("aten::kl_div") +@symbolic_helper.parse_args("v", "v", "i", "b") +def kl_div(g: jit_utils.GraphContext, input, target, reduction, log_target): + if log_target: + output = _kl_div_log_target_impl(g, input, target) + else: + output = _kl_div_non_log_target_impl(g, input, target) + + if reduction == 0: + return output + elif reduction == 1: + return g.op("ReduceMean", output, keepdims_i=0) + elif reduction == 2: + return symbolic_helper._reducesum_helper(g, output, keepdims_i=0) + else: + return symbolic_helper._onnx_unsupported( + "kl_div with reduction other than none, mean, or sum.", input + ) + + +@_onnx_symbolic("aten::mse_loss") +@symbolic_helper.parse_args("v", "v", "i") +def mse_loss(g: jit_utils.GraphContext, input, target, reduction): + output = mul(g, sub(g, input, target), sub(g, input, target)) + if reduction == 0: + return output + elif reduction == 1: + return g.op("ReduceMean", output, keepdims_i=0) + elif reduction == 2: + return symbolic_helper._reducesum_helper(g, output, keepdims_i=0) + else: + return symbolic_helper._onnx_unsupported( + "mse_loss with reduction other than none, mean, or sum.", input + ) + + +@_onnx_symbolic("aten::as_strided") +@symbolic_helper.quantized_args(True) +@symbolic_helper.parse_args("v", "v", "is", "i") +def as_strided(g: jit_utils.GraphContext, self, sizes, strides, offset=None): + sizes = symbolic_helper._maybe_get_const(sizes, "is") + rank = len(strides) + self_1d = symbolic_helper._reshape_helper( + g, self, g.op("Constant", value_t=torch.tensor([-1], dtype=torch.int64)) + ) + ind: torch.Tensor | None + if not symbolic_helper._is_value(sizes): + ind = torch.tensor([0], dtype=torch.long) + for i, (size, stride) in enumerate(zip(sizes, strides)): + r_size = [1] * rank + r_size[i] = -1 + ind = ind + torch.arange(size).view(r_size) * stride + if offset: + ind = ind + offset + return g.op("Gather", self_1d, g.op("Constant", value_t=ind)) + else: + ind = None + for i, stride in enumerate(strides): + r_size = [1] * rank + r_size[i] = -1 + size = select( + g, + sizes, + g.op("Constant", value_t=torch.tensor([0])), + g.op("Constant", value_t=torch.tensor(i)), + ) + tmp_ind = symbolic_helper._reshape_helper( + g, + arange(g, size, 4, None, None, None), + g.op("Constant", value_t=torch.tensor(r_size)), + ) + tmp_ind = g.op( + "Mul", tmp_ind, g.op("Constant", value_t=torch.tensor([stride])) + ) + if ind is None: + ind = tmp_ind + else: + ind = g.op("Add", ind, tmp_ind) + if offset: + ind = g.op("Add", ind, g.op("Constant", torch.tensor([offset]))) + return g.op("Gather", self_1d, ind) + + +@_onnx_symbolic("aten::__derive_index") +def __derive_index(g: jit_utils.GraphContext, index, start, step): + return g.op("Add", start, g.op("Mul", index, step)) + + +@_onnx_symbolic("aten::__range_length") +# Source code for aten op can be found here: pytorch/torch/csrc/jit/runtime/register_prim_ops.cpp +# if (step > 0 && lo < hi) { +# push(stack, 1 + (hi - 1 - lo) / step); +# } else if (step < 0 && lo > hi) { +# push(stack, 1 + (lo - 1 - hi) / (0 
- step)); +# } else { +# push(stack, 0); +# } +def __range_length(g: jit_utils.GraphContext, lo, hi, step): + sub = g.op("Sub", hi, lo) + div = g.op("Ceil", true_divide(g, sub, step)) + return g.op("Cast", div, to_i=_C_onnx.TensorProtoDataType.INT64) + + +@_onnx_symbolic("aten::linear") +def linear(g: jit_utils.GraphContext, input, weight, bias): + rank = symbolic_helper._get_tensor_rank(input) + weight = t(g, weight) + if rank == 2 and not bias.node().mustBeNone(): + alpha = g.op("Constant", value_t=torch.tensor(1, dtype=torch.int64)) + beta = g.op("Constant", value_t=torch.tensor(1, dtype=torch.int64)) + output = addmm(g, bias, input, weight, alpha, beta) + else: + output = matmul(g, input, weight) + if not bias.node().mustBeNone(): + output = add(g, bias, output) + + return output + + +@_onnx_symbolic("aten::hann_window") +@symbolic_helper.parse_args("v", "b", "i", "v", "v", "v", "v") +def hann_window( + g: jit_utils.GraphContext, + window_length, + periodic=True, + dtype: int | None = None, + layout=None, + device=None, + pin_memory=None, + requires_grad=False, +): + if dtype is None: + dtype_ = torch.get_default_dtype() + if not dtype_ or not dtype_.is_floating_point: + dtype_ = torch.float + scalar_type = _type_utils.JitScalarType.from_dtype(dtype_) + else: + scalar_type = _type_utils.JitScalarType(dtype) + + n_array = arange(g, window_length, 4, None, None, None) + output = g.op("Cast", n_array, to_i=_C_onnx.TensorProtoDataType.FLOAT) + output = mul( + g, g.op("Constant", value_t=torch.tensor(math.pi, dtype=torch.float)), output + ) + + if periodic is False: + window_length = sub( + g, window_length, g.op("Constant", value_t=torch.tensor(1, dtype=torch.int)) + ) + output = div(g, output, window_length) + output = g.op( + "Cast", + square(g, sin(g, output)), + to_i=scalar_type.onnx_type(), + ) + + return output + + +@_onnx_symbolic("aten::mv") +def mv(g: jit_utils.GraphContext, self, vec): + return matmul(g, self, vec) + + +@_onnx_symbolic("aten::dot") +def dot(g: jit_utils.GraphContext, self, other): + return matmul(g, self, other) + + +@_onnx_symbolic("aten::movedim") +@symbolic_helper.parse_args("v", "t", "t") +def movedim(g: jit_utils.GraphContext, self, source, destination): + # This is a pythonic implementation mostly taken from aten/src/ATen/native/TensorShape.cpp::movedim + source = source.view(-1) + destination = destination.view(-1) + + assert source.size() == destination.size() + + if (source == destination).all(): + return self + + self_rank = symbolic_helper._get_tensor_rank(self) + assert self_rank is not None + + perm = list(range(self_rank)) + + src_dims = perm.copy() + dst_dims = perm.copy() + + for src, dst in zip(source.tolist(), destination.tolist()): + perm[dst] = src + src_dims[src] = -1 + dst_dims[dst] = -1 + + src_dims = [dim for dim in src_dims if dim != -1] + dst_dims = [dim for dim in dst_dims if dim != -1] + + for src, dst in zip(src_dims, dst_dims): + perm[dst] = src + + return g.op("Transpose", self, perm_i=perm) + + +@_onnx_symbolic("aten::fill") +@symbolic_helper.parse_args("v", "v") +def fill(g: jit_utils.GraphContext, self, value): + scalar_type = _type_utils.JitScalarType.from_value( + self, _type_utils.JitScalarType.FLOAT + ) + return full_like(g, self, value, scalar_type) + + +@_onnx_symbolic("aten::index_add") +def index_add(g: jit_utils.GraphContext, self, dim, index, other, alpha=None): + warnings.warn( + "Warning: ONNX export does not support duplicated values in 'index' field, " + + "this will cause the ONNX model to be incorrect." 
+ ) + + # ONNX does not support "alpha" argument, unlike aten index_add + # See: https://github.com/pytorch/pytorch/pull/65993#issuecomment-953151102 for more context + if alpha and symbolic_helper._scalar(symbolic_helper._maybe_get_scalar(alpha)) != 1: + return symbolic_helper._unimplemented("index_add", "alpha != 1", self) + + dim = symbolic_helper._maybe_get_const(dim, "i") + if dim is None: + raise errors.SymbolicValueError( + "ONNX export does NOT support exporting 'index_add_()' function with " + "unknown 'dim' value.", + self, + ) + + self_dim_rank = symbolic_helper._get_tensor_rank(self) + other_dim_rank = symbolic_helper._get_tensor_rank(other) + + if self_dim_rank is None or other_dim_rank is None: + raise errors.SymbolicValueError( + "ONNX export does NOT support exporting 'index_add_()' function while " + "the rank of self tensor or tensor to be added is unknown.", + self, + ) + + if other_dim_rank != self_dim_rank: + delta = self_dim_rank - other_dim_rank + for i in range(delta): + other = symbolic_helper._unsqueeze_helper( + g, other, [symbolic_helper._get_tensor_rank(other)] + ) + + other_dim_size = symbolic_helper._get_tensor_dim_size(other, dim) + self_dim_size = symbolic_helper._get_tensor_dim_size(self, dim) + + if (other_dim_size is not None) and (self_dim_size is not None): + if other_dim_size > self_dim_size: + raise errors.SymbolicValueError( + "ONNX export does not support exporting 'index_add_()' function with " + "duplicated values in 'index' parameter yet.", + self, + ) + + # Construct a new shape. It's almost as same as self except the size of the 'dim' + # dimension is 1, so that we can expand other dimensions as expected. + new_shape_axes = list(range(self_dim_rank)) + new_shape_starts = [0 for i in range(self_dim_rank)] + new_shape_ends = [sys.maxsize if (i != dim) else 1 for i in range(self_dim_rank)] + + new_shape = symbolic_helper._slice_helper( + g, self, axes=new_shape_axes, starts=new_shape_starts, ends=new_shape_ends + ) + other = expand_as(g, other, new_shape) + + for i in range(dim): + index = symbolic_helper._unsqueeze_helper(g, index, [0]) + + for i in range(self_dim_rank - dim - 1): + index = symbolic_helper._unsqueeze_helper( + g, index, [symbolic_helper._get_tensor_rank(index)] + ) + + return scatter_add(g, self, dim, expand_as(g, index, other), other) + + +@_onnx_symbolic("aten::roll") +@symbolic_helper.parse_args("v", "is", "is") +def roll(g: jit_utils.GraphContext, self, shifts, dims): + assert len(shifts) == len(dims) + + result = self + for i in range(len(shifts)): + shapes = [] + shape = symbolic_helper._slice_helper( + g, result, axes=[dims[i]], starts=[-shifts[i]], ends=[sys.maxsize] + ) + shapes.append(shape) + shape = symbolic_helper._slice_helper( + g, result, axes=[dims[i]], starts=[0], ends=[-shifts[i]] + ) + shapes.append(shape) + result = g.op("Concat", *shapes, axis_i=dims[i]) + + return result + + +@_onnx_symbolic("aten::cross") +@symbolic_helper.parse_args("v", "v", "i") +def cross(g: jit_utils.GraphContext, input, other, dim=None): + dim = symbolic_helper._get_dim_for_cross(input, dim) + # If we have two tensors such that + # A = [a, b, c], B = [d, e, f], we permute the tensor such that we have + # After first roll, + # A' = [b, c, a], B' = [f, d, e], so that we calculate (b*f, c*d, a*e) + roll_x_1 = roll(g, input, [2], [dim]) + roll_y_1 = roll(g, other, [1], [dim]) + # After second roll, + # A' = [c, a, b], B' = [e, f, d], so that we calculate (c*e, a*f, b*d) + roll_x_2 = roll(g, input, [1], [dim]) + roll_y_2 = roll(g, other, 
[2], [dim]) + # cross product is calculated as + # result = [(b*f - c*e), (c*d - a*f), (a*e - b*d)] + return sub(g, mul(g, roll_x_1, roll_y_1), mul(g, roll_x_2, roll_y_2)) + + +@_onnx_symbolic("aten::cdist") +def cdist( + g: jit_utils.GraphContext, + x1, + x2, + p=2.0, + compute_mode="use_mm_for_euclid_dist_if_necessary", +): + # X1.shape = (B * P * D), X2.shape = (B * R * D) + # In order to respect numpy style broadcasting as demonstrated in + # https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md + # we unsqueeze both input tensors + # Currently we ignore the 'compute_mode' variable as we use default to + # using matrix multiplication to calculate the euclidean distance + rank = symbolic_helper._get_tensor_rank(x1) + assert rank is not None + broadcasted_x1 = symbolic_helper._unsqueeze_helper(g, x1, [rank - 1]) + broadcasted_x2 = symbolic_helper._unsqueeze_helper(g, x2, [rank - 2]) + return pairwise_distance( + g, broadcasted_x1, broadcasted_x2, p, eps=1e-06, keepdim=False + ) + + +@_onnx_symbolic("aten::lerp") +def lerp(g: jit_utils.GraphContext, self, end, weight): + # Conditional for better numeric. This has been discussed in + # https://github.com/pytorch/pytorch/pull/18871 + diff = g.op("Sub", end, self) + return where( + g, + g.op("Less", weight, g.op("Constant", value_t=torch.tensor(0.5))), + g.op("Add", self, g.op("Mul", weight, diff)), + g.op( + "Sub", + end, + g.op( + "Mul", + diff, + g.op("Sub", g.op("Constant", value_t=torch.tensor(1.0)), weight), + ), + ), + ) + + +@_onnx_symbolic("aten::broadcast_tensors") +def broadcast_tensors(g: jit_utils.GraphContext, self): + all_tensors = symbolic_helper._unpack_list(self) + t_with_final_shape = zeros_like(g, all_tensors[0]) + + # Add operator supports multidirectional broadcasting. So we leverage this function + # to infer the final shape generated by the broadcast. + for t in all_tensors: + t_with_final_shape = add(g, t_with_final_shape, t) + + t_list = [expand_as(g, t, t_with_final_shape) for t in all_tensors] + return g.op("prim::ListConstruct", *t_list) + + +@_onnx_symbolic("aten::is_pinned") +def is_pinned(g: jit_utils.GraphContext, self, device=None): + # Unused by ONNX. + return None + + +@_onnx_symbolic("prim::ConstantSplit") +def prim_constant_split(g: jit_utils.GraphContext, self, split_size, dim): + size = symbolic_helper._get_tensor_dim_size(self, dim) + if size is None: + return symbolic_helper._unimplemented( + "prim::ConstantSplit", "unknown dimension size", self + ) + splits = [split_size] * (size // split_size) + leftover = size % split_size + if leftover: + splits.append(leftover) + return g.op("Split", self, split_i=splits, axis_i=dim, outputs=len(splits)) + + +# TODO: It would be better to export this as a chunk directly, as this is +# less sensitive to changes in input size. 
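As a reading aid for the chunk-to-split desugaring handled by `prim_constant_split` above and `prim_constant_chunk` below, here is a minimal, self-contained sketch of the split-size computation. It is illustrative only and not part of the exported file; the helper name `chunk_split_sizes` is invented for this example, and it assumes a statically known dimension size, just as the symbolics themselves do.

```python
# Illustrative sketch (not part of torch.onnx): how prim::ConstantChunk's
# split sizes are derived before delegating to prim::ConstantSplit.
def chunk_split_sizes(dim_size: int, chunks: int) -> list[int]:
    # Ceiling division, mirroring prim_constant_chunk's split_size computation.
    split_size = (dim_size + chunks - 1) // chunks
    # Full-size pieces first, then the leftover piece if the dim is not divisible.
    splits = [split_size] * (dim_size // split_size)
    leftover = dim_size % split_size
    if leftover:
        splits.append(leftover)
    return splits

# Matches torch.chunk semantics: a length-10 dim chunked into 3 gives 4, 4, 2.
assert chunk_split_sizes(10, 3) == [4, 4, 2]
assert chunk_split_sizes(8, 2) == [4, 4]
```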
+# TODO: Once we have proper scoping, stop reimplementing chunk, delete this +# method, and use the desugared version +@_onnx_symbolic("prim::ConstantChunk") +def prim_constant_chunk(g: jit_utils.GraphContext, self, chunks, dim): + dim_size = symbolic_helper._get_tensor_dim_size(self, dim) + if dim_size is None: + return symbolic_helper._unimplemented( + "prim::ConstantChunk", "unknown dimension size", self + ) + split_size = (dim_size + chunks - 1) // chunks + return prim_constant_split(g, self, split_size, dim) + + +@_onnx_symbolic("prim::shape") +def prim_shape(g: jit_utils.GraphContext, self): + return g.op("Shape", self) + + +@_onnx_symbolic("prim::max") +def prim_max(g: jit_utils.GraphContext, self, other): + return symbolic_helper._op_with_optional_float_cast( + g, "Max", self, other, opset_before=12 + ) + + +@_onnx_symbolic("prim::min") +def prim_min(g: jit_utils.GraphContext, self, other=None): + if not other: + if symbolic_helper._is_packed_list(self): + self = stack(g, self, g.op("Constant", value_t=torch.tensor([0]))) + return min(g, self) + return min(g, self, other) + + +@_onnx_symbolic("prim::data") +def prim_data(g: jit_utils.GraphContext, self): + return self + + +@_onnx_symbolic("prim::layout") +def prim_layout(g: jit_utils.GraphContext, self): + # Always return 'torch.strided'. Other layout types are not supported by JIT 'TensorType'. + # Layout class defined in 'c10/core/Layout.h'. + return g.op("Constant", value_t=torch.tensor(0)) + + +@_onnx_symbolic("prim::ListConstruct") +def prim_list_construct(g: jit_utils.GraphContext, *inputs, **kwargs): + return None + + +@_onnx_symbolic("prim::ListUnpack") +def prim_list_unpack( + g: jit_utils.GraphContext, *inputs, **kwargs +) -> list[_C.Value] | None: + if len(inputs) == 1 and inputs[0].node().kind() == "prim::ListConstruct": + # Cancel the previous node if it is ListConstruct by returning its inputs + # TODO(justinchuby): Use a public method in the helper module + return symbolic_helper._unpack_list(inputs[0]) + + return None + + +@_onnx_symbolic("prim::TupleConstruct") +def prim_tuple_construct(g: jit_utils.GraphContext, *inputs, **kwargs): + return None + + +@_onnx_symbolic("prim::Uninitialized") +def prim_uninitialized(g: jit_utils.GraphContext, *inputs, **kwargs): + return None + + +# exists to refine the type of the Value +# if x is an optional Tensor, unchecked_cast will cast +# x to Tensor, so the rest of the graph knows that x is a Tensor +# this doesn't do anything in runtime and is a noop in ONNX +@_onnx_symbolic("prim::unchecked_cast") +def prim_unchecked_cast(g: jit_utils.GraphContext, self): + return self + + +@_onnx_symbolic("prim::dtype") +def prim_dtype(g: jit_utils.GraphContext, self): + scalar_type = symbolic_helper._try_get_scalar_type(self) + if scalar_type is None: + scalar_type = _type_utils.JitScalarType.FLOAT + # This node records a torch dtype as int + return g.op("Constant", value_t=torch.tensor(scalar_type)) + + +@_onnx_symbolic("prim::tolist") +def prim_tolist(g: jit_utils.GraphContext, input, dim_val, elem_ty_val): + """tolist is currently supported only for 1D input tensors. + + dim_val and elem_ty_val represent dimension and type annotations + that need to match dimension and type of the input tensor. 
+ """ + dim = symbolic_helper._maybe_get_const(dim_val, "i") + if dim > 1: + return symbolic_helper._unimplemented("prim::tolist", "dim_val > 1", input) + return input + + +# ----------------------------------------------------------------------------- +# Symbolic functions that need extra context +# ----------------------------------------------------------------------------- +@_onnx_symbolic("prim::device") +def prim_device(g: jit_utils.GraphContext, *inputs, **kwargs) -> None: + output_type = g.original_node.output().type() + if isinstance(output_type, _C.DeviceObjType): + return None + + return symbolic_helper._unimplemented( + "prim::device", + f"output type should be 'DeviceObjType', not '{output_type.kind()}'", + g.original_node.output(), + ) + + +@_onnx_symbolic("prim::Loop") +def prim_loop(g: jit_utils.GraphContext, *inputs, **attrs) -> list[_C.Value]: + node = g.original_node + env = g.env + values_in_env = g.values_in_env + params_dict = g.params_dict + + operator_export_type = GLOBALS.operator_export_type + opset_version = GLOBALS.export_onnx_opset_version + + old_blocks = tuple(node.blocks()) + new_op_outputs, new_block_contexts, new_node = jit_utils.add_op_with_blocks( + g, "Loop", *inputs, outputs=node.outputsSize(), n_blocks=len(old_blocks) + ) + + for old_block, new_block_context in zip(old_blocks, new_block_contexts): + # Copy input metadata to subblock + # + # prim::Loop(iter, cond, input_1, ..., input_n) + # block0(iter, input_1, ..., input_n) + # + # For `Loop` node, copy metadata for `iter`, `input_1`, ..., `input_n`. + for i, b_in in enumerate(old_block.inputs()): + if i == 0 and i < len(inputs): + b_in.setType(inputs[i].type()) + # For optional block inputs, they may switch between None not-None inside + # the loop body, so if the loop input is not optional, the block input may + # still need to be optional. + if ( + i > 0 + and (i + 1) < len(inputs) + and not isinstance(b_in.type(), _C.OptionalType) + ): + b_in.setType(inputs[i + 1].type()) + torch._C._jit_pass_onnx_block( + old_block, + new_block_context.block, + operator_export_type, + env, + values_in_env, + False, + ) + fixed_outputs = torch._C._jit_pass_fixup_onnx_controlflow_node( + new_node, opset_version + ) + # Run shape type inference for Loop after subblock is converted. + if GLOBALS.onnx_shape_inference: + torch._C._jit_pass_onnx_node_shape_type_inference( + new_node, params_dict, opset_version + ) + return fixed_outputs + + +@_onnx_symbolic("prim::If") +def prim_if(g: jit_utils.GraphContext, *inputs, **attrs) -> list[_C.Value]: + n = g.original_node + block = g.block + env = g.env + values_in_env = g.values_in_env + params_dict = g.params_dict + + operator_export_type = GLOBALS.operator_export_type + opset_version = GLOBALS.export_onnx_opset_version + + static_if = inputs[0].node().kind() == "onnx::Constant" + if static_if: + # Fold static if + # + # The torch IR + # graph(%embedding_matrix.1 : Float(10, 15, strides=[15, 1], requires_grad=0, device=cpu), + # %input.1 : Long(6, strides=[1], requires_grad=0, device=cpu), ... 
+ # %65 : Bool(requires_grad=0, device=cpu) = prim::Constant[value={0}]() + # %21 : Long(device=cpu) = aten::eq(%20, %64) + # %22 : Long(device=cpu) = prim::If(%21) + # block0(): + # %23 : Long(device=cpu) = aten::is_floating_point(%input.1) + # -> (%23) + # block1(): + # -> (%65) + # %input.53 : Tensor, %weight : Tensor = prim::If(%22) + # block0(): + # -> (%embedding_matrix.1, %input.1) + # block1(): + # -> (%input.1, %embedding_matrix.1) + # %26 : int[] = aten::size(%input.53) + # + # The converted ONNX graph + # %10 : Bool(device=cpu) = onnx::Constant[value={0}]() + # %14 : Bool(device=cpu) = onnx::Equal(%13, %8) + # %15 : Bool(requires_grad=0, device=cpu) = onnx::Constant[value={0}]() + # %16 : Long(1, strides=[1], device=cpu) = onnx::Shape(%input.1) + input_flag = symbolic_helper._node_get(inputs[0].node(), "value").tolist() + const_value = ( + all(input_flag) if isinstance(input_flag, list) else bool(input_flag) + ) + block_idx = 0 if const_value else 1 + current_b = list(n.blocks())[block_idx] + env = torch._C._jit_pass_onnx_block( + current_b, + block, + operator_export_type, + env, + values_in_env, + True, + ) + if_output_list = list(n.outputs()) + current_b_list = list(current_b.outputs()) + + final_b_list = [] + for idx in range(len(if_output_list)): + if current_b_list[idx] not in env: + raise errors.SymbolicValueError( + f"The sub block ATen output {current_b_list[idx]} is not in env.", + current_b_list[idx], + ) # type:ignore[operator] + onnx_b = env[current_b_list[idx]] + final_b_list.append(onnx_b) + return final_b_list + else: + old_blocks = tuple(n.blocks()) + new_op_outputs, new_block_contexts, new_node = jit_utils.add_op_with_blocks( + g, "If", *inputs, outputs=n.outputsSize(), n_blocks=len(old_blocks) + ) + + for old_block, new_block_context in zip(old_blocks, new_block_contexts): + torch._C._jit_pass_onnx_block( + old_block, + new_block_context.block, + operator_export_type, + env, + values_in_env, + False, + ) + fixed_outputs = torch._C._jit_pass_fixup_onnx_controlflow_node( + new_node, opset_version + ) + # Run shape type inference for If after subblock is converted. + if GLOBALS.onnx_shape_inference: + torch._C._jit_pass_onnx_node_shape_type_inference( + new_node, params_dict, opset_version + ) + return fixed_outputs + + +@_onnx_symbolic("prim::Constant") +def prim_constant(g: jit_utils.GraphContext, *inputs, **attrs): + node = g.original_node + + if node.mustBeNone(): + return None + # This must go before checking for string values, because some device constants + # have string values, but we want to keep them as unconverted Device types so + # that eq() can work on them. + if isinstance(node.output().type(), _C.DeviceObjType): + return None + if node.kindOf("value") == "t": + return g.op("Constant", value_t=symbolic_helper._node_get(node, "value")) + if node.kindOf("value") == "s": + return g.op("Constant", value_s=symbolic_helper._node_get(node, "value")) + if node.output().type().isSubtypeOf( + _C.ListType.ofInts() + ) or node.output().type().isSubtypeOf(_C.ListType.ofFloats()): + return g.op( + "Constant", value_t=torch.tensor(symbolic_helper._node_get(node, "value")) + ) + if node.output().type().isSubtypeOf(_C.ListType.ofStrings()): + str_constants = [ + g.op("Constant", value_s=s) + for s in symbolic_helper._node_get(node, "value") + ] + return g.op("prim::ListConstruct", *str_constants) + + raise errors.SymbolicValueError( + f"Unsupported prim::Constant kind: '{node.kindOf('value')}'. 
" + f"Please send a bug report at {_constants.PYTORCH_GITHUB_ISSUES_URL}.", + node.output(), + ) + + +@_onnx_symbolic("prim::type") +def prim_type(g: jit_utils.GraphContext, device_value: _C.Value, *args, **kwargs): + if device_value.node().kind() == "prim::device": + device = jit_utils.get_device_from_value(device_value.node().input()) + if device is not None: + return g.op("Constant", value_s=str(device)) + + return symbolic_helper._unimplemented( + "prim::type", + "Device type cannot be statically determined.", + device_value, + ) + + +@_onnx_symbolic("onnx::Placeholder") +def onnx_placeholder(g: jit_utils.GraphContext, *inputs, **attrs): + node = g.original_node + block = g.block + env = g.env + values_in_env = g.values_in_env + + return torch._C._jit_onnx_convert_pattern_from_subblock( + block, node, env, values_in_env + ) + + +@_onnx_symbolic("aten::resolve_conj") +@_onnx_symbolic("aten::resolve_neg") +def noop_complex_operators(g: jit_utils.GraphContext, input: _C.Value): + # ONNX does not have operators to *directly* manipulate real/imaginary components + # However, a few torch APIs (e.g. .tolist()) use complex operations when input is real, + # which results in failures due to missing operators for complex numbers + + # `aten::resolve_conj` and `aten::resolve_neg` can safely be implemented as no-op + return input + + +@_onnx_symbolic("aten::_conj") +@_onnx_symbolic("aten::conj_physical") +def unsupported_complex_operators(g: jit_utils.GraphContext, input: _C.Value): + # ONNX does not have operators to *directly* manipulate real/imaginary components + # However, a few torch APIs (e.g. .tolist()) use complex operations when input is real, + # which results in failures due to missing operators for complex numbers + + # While `aten::_conj` and `aten::conj_physical` raise exception when input is complex + if symbolic_helper.is_complex_value(input): + # FIXME(justinchuby): report correct name for symbolic being executed + return symbolic_helper._onnx_unsupported( + "aten::_conj, aten::conj_physical", + input, + ) + + # they can safely be implemented as no-op for real numbers only + return noop_complex_operators(g, input) + + +@_onnx_symbolic("aten::logit") +def logit(g: jit_utils.GraphContext, self: torch._C.Value, eps: torch._C.Value): + one = g.op("Constant", value_t=torch.tensor(1.0)) + + if not symbolic_helper._is_none(eps): + eps = g.op( + "Cast", eps, to_i=_type_utils.JitScalarType.from_value(self).onnx_type() + ) + one_sub_eps = g.op("Sub", one, eps) + self_less_equal_one_sub_eps = g.op("Greater", one_sub_eps, self) + temporary_self = g.op("Where", self_less_equal_one_sub_eps, self, one_sub_eps) + + temporary_self_less_eps = g.op("Less", temporary_self, eps) + z = g.op("Where", temporary_self_less_eps, eps, temporary_self) + else: + z = self + + sub = g.op("Sub", one, z) + div = g.op("Div", z, sub) + return g.op("Log", div) diff --git a/.venv/Lib/site-packages/torch/onnx/utils.py b/.venv/Lib/site-packages/torch/onnx/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..20e106f87ee46630432905cb6b44cb0130782cb3 --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/utils.py @@ -0,0 +1,1990 @@ +# mypy: allow-untyped-defs +"""Functions to export models into the ONNX IR format. + +These models can be loaded with the ONNX library and then +converted to models which run on other deep learning frameworks. 
+""" + +from __future__ import annotations + +import contextlib +import copy +import inspect +import re +import typing +import warnings +from typing import Any, Callable, cast, Collection, Mapping, Sequence + +import torch +import torch._C._onnx as _C_onnx +import torch.jit._trace +import torch.serialization +from torch import _C +from torch.onnx import ( # noqa: F401 + _constants, + _deprecation, + _exporter_states, + errors, + symbolic_helper, +) +from torch.onnx._globals import GLOBALS +from torch.onnx._internal import diagnostics, jit_utils, onnx_proto_utils, registration + + +__all__ = [ + "is_in_onnx_export", + "select_model_mode_for_export", + "disable_apex_o2_state_dict_hook", + "setup_onnx_logging", + "exporter_context", + "export", + "model_signature", + "warn_on_static_input_change", + "unpack_quantized_tensor", + "export_to_pretty_string", + "unconvertible_ops", + "register_custom_op_symbolic", + "unregister_custom_op_symbolic", +] + + +def is_in_onnx_export() -> bool: + """Returns whether it is in the middle of ONNX export.""" + return GLOBALS.in_onnx_export + + +# TODO(justinchuby): Remove dependency to this global variable from constant_fold.cpp +# Skip check due to cannot import IValue from torch._C +_params_dict = {} # type: ignore[var-annotated] + + +@contextlib.contextmanager +def select_model_mode_for_export(model, mode: _C_onnx.TrainingMode): + r"""A context manager to temporarily set the training mode of ``model`` + to ``mode``, resetting it when we exit the with-block. + + Args: + model: Same type and meaning as ``model`` arg to :func:`export`. + mode: Same type and meaning as ``training`` arg to :func:`export`. + """ + if not isinstance(mode, _C_onnx.TrainingMode): + raise TypeError( + f"'mode' should be a torch.onnx.TrainingMode enum, but got '{type(mode)}'." + ) + originally_training: bool = False + + if hasattr(model, "training"): + originally_training = model.training + + # ONNX opset 12 has better support for training amenable models, with updated + # versions of the dropout and batch_norm operators + if mode == _C_onnx.TrainingMode.TRAINING or ( + mode == _C_onnx.TrainingMode.PRESERVE and originally_training + ): + GLOBALS.export_training = True + if GLOBALS.export_onnx_opset_version < 12: + warnings.warn( + "You are exporting the model in training mode with onnx opset " + f"version {GLOBALS.export_onnx_opset_version}. " + "Opset versions lower than opset 12 will not be able to export " + "nodes such as Dropout and BatchNorm correctly." + ) + else: + GLOBALS.export_training = False + + GLOBALS.training_mode = mode + if mode == _C_onnx.TrainingMode.TRAINING: + model.train(True) + elif mode == _C_onnx.TrainingMode.EVAL: + model.train(False) + # else mode == _C_onnx.TrainingMode.PRESERVE, do nothing + + try: + yield + finally: + if hasattr(model, "training") and not mode == _C_onnx.TrainingMode.PRESERVE: + model.train(originally_training) + + +@contextlib.contextmanager +def disable_apex_o2_state_dict_hook(model: torch.nn.Module | torch.jit.ScriptFunction): + # Apex O2 hook state_dict to return fp16 weights as fp32. + # Exporter cannot identify them as same tensors. + # Since this hook is only used by optimizer, it is safe to + # remove this hook while exporting. 
+ if not isinstance(model, torch.jit.ScriptFunction): + model_hooks = {} # type: ignore[var-annotated] + for module in model.modules(): + for key, hook in module._state_dict_hooks.items(): + if type(hook).__name__ == "O2StateDictHook": + if module not in model_hooks: + model_hooks[module] = {} + model_hooks[module][key] = hook + if module in model_hooks: + for key in model_hooks[module]: + module._state_dict_hooks.pop(key) + try: + yield + finally: + # Add the hooks back + for module, m_map in model_hooks.items(): + for key, hook in m_map.items(): + module._state_dict_hooks[key] = hook + else: + try: + yield + finally: + pass + + +@contextlib.contextmanager +def setup_onnx_logging(verbose: bool): + is_originally_enabled = torch.onnx.is_onnx_log_enabled() + if is_originally_enabled or verbose: + torch.onnx.enable_log() + try: + yield + finally: + if not is_originally_enabled: + torch.onnx.disable_log() + + +@contextlib.contextmanager +def exporter_context(model, mode: _C_onnx.TrainingMode, verbose: bool): + with select_model_mode_for_export( + model, mode + ) as mode_ctx, disable_apex_o2_state_dict_hook( + model + ) as apex_ctx, setup_onnx_logging( + verbose + ) as log_ctx, diagnostics.create_export_diagnostic_context() as diagnostic_ctx: + yield (mode_ctx, apex_ctx, log_ctx, diagnostic_ctx) + + +def _get_torch_export_args( + args: tuple[Any, ...], + kwargs: dict[str, Any] | None, +) -> tuple[tuple[Any, ...], dict[str, Any] | None]: + """Obtain the arguments for torch.onnx.export from the model and the input arguments.""" + if not kwargs and args and isinstance(args[-1], dict): + kwargs = args[-1] + args = args[:-1] + return args, kwargs + + +def export( + model: torch.nn.Module | torch.jit.ScriptModule | torch.jit.ScriptFunction, + args: tuple[Any, ...] | torch.Tensor, + f: str, + *, + kwargs: dict[str, Any] | None = None, + export_params: bool = True, + verbose: bool = False, + training: _C_onnx.TrainingMode = _C_onnx.TrainingMode.EVAL, + input_names: Sequence[str] | None = None, + output_names: Sequence[str] | None = None, + operator_export_type: _C_onnx.OperatorExportTypes = _C_onnx.OperatorExportTypes.ONNX, + opset_version: int | None = None, + do_constant_folding: bool = True, + dynamic_axes: Mapping[str, Mapping[int, str]] + | Mapping[str, Sequence[int]] + | None = None, + keep_initializers_as_inputs: bool | None = None, + custom_opsets: Mapping[str, int] | None = None, + export_modules_as_functions: bool | Collection[type[torch.nn.Module]] = False, + autograd_inlining: bool = True, +) -> None: + r"""Exports a model into ONNX format. + + If ``model`` is not a :class:`torch.jit.ScriptModule` nor a + :class:`torch.jit.ScriptFunction`, this runs + ``model`` once in order to convert it to a TorchScript graph to be exported + (the equivalent of :func:`torch.jit.trace`). Thus this has the same limited support + for dynamic control flow as :func:`torch.jit.trace`. + + Args: + model: The model to be exported. + args: + + args can be structured either as: + + 1. ONLY A TUPLE OF ARGUMENTS:: + + args = (x, y, z) + + The tuple should contain model inputs such that ``model(*args)`` is a valid + invocation of the model. Any non-Tensor arguments will be hard-coded into the + exported model; any Tensor arguments will become inputs of the exported model, + in the order they occur in the tuple. + + 2. A TENSOR:: + + args = torch.Tensor([1]) + + This is equivalent to a 1-ary tuple of that Tensor. + + 3. 
A TUPLE OF ARGUMENTS ENDING WITH A DICTIONARY OF NAMED ARGUMENTS:: + + args = (x, {"y": input_y, "z": input_z}) + + All but the last element of the tuple will be passed as non-keyword arguments, + and named arguments will be set from the last element. If a named argument is + not present in the dictionary, it is assigned the default value, or None if a + default value is not provided. + + .. warning:: + This behavior will be deprecated in a future release. Please use the + kwargs argument instead. + + .. note:: + If a dictionary is the last element of the args tuple, it will be + interpreted as containing named arguments. In order to pass a dict as the + last non-keyword arg, provide an empty dict as the last element of the args + tuple. For example, instead of:: + + torch.onnx.export( + model, + ( + x, + # WRONG: will be interpreted as named arguments + {y: z}, + ), + "test.onnx.pb", + ) + + Write:: + + torch.onnx.export(model, (x, {y: z}, {}), "test.onnx.pb") + + f: Path to the output ONNX model file. E.g. "model.onnx". + kwargs: Named arguments to the model. + export_params: If True, all parameters will + be exported. Set this to False if you want to export an untrained model. + In this case, the exported model will first take all of its parameters + as arguments, with the ordering as specified by ``model.state_dict().values()`` + verbose: if True, prints a description of the + model being exported to stdout. In addition, the final ONNX graph will include the + field ``doc_string``` from the exported model which mentions the source code locations + for ``model``. If True, ONNX exporter logging will be turned on. + training: + * ``TrainingMode.EVAL``: export the model in inference mode. + * ``TrainingMode.PRESERVE``: export the model in inference mode if model.training is + False and in training mode if model.training is True. + * ``TrainingMode.TRAINING``: export the model in training mode. Disables optimizations + which might interfere with training. + input_names (list of str, default empty list): names to assign to the + input nodes of the graph, in order. + output_names (list of str, default empty list): names to assign to the + output nodes of the graph, in order. + operator_export_type (enum, default OperatorExportTypes.ONNX): + + .. warning:: + This option will be deprecated in a future release. Future exported + graphs will always use the default opset domain. + + * ``OperatorExportTypes.ONNX``: Export all ops as regular ONNX ops + (in the default opset domain). + * ``OperatorExportTypes.ONNX_FALLTHROUGH``: Try to convert all ops + to standard ONNX ops in the default opset domain. If unable to do so + (e.g. because support has not been added to convert a particular torch op to ONNX), + fall back to exporting the op into a custom opset domain without conversion. Applies + to `custom ops `_ + as well as ATen ops. For the exported model to be usable, the runtime must support + these non-standard ops. + * ``OperatorExportTypes.ONNX_ATEN``: All ATen ops (in the TorchScript namespace "aten") + are exported as ATen ops (in opset domain "org.pytorch.aten"). + `ATen `_ is PyTorch's built-in tensor library, so + this instructs the runtime to use PyTorch's implementation of these ops. + + .. warning:: + + Models exported this way are probably runnable only by Caffe2. + + This may be useful if the numeric differences in implementations of operators are + causing large differences in behavior between PyTorch and Caffe2 (which is more + common on untrained models). 
+ + * ``OperatorExportTypes.ONNX_ATEN_FALLBACK``: Try to export each ATen op + (in the TorchScript namespace "aten") as a regular ONNX op. If we are unable to do so + (e.g. because support has not been added to convert a particular torch op to ONNX), + fall back to exporting an ATen op. See documentation on OperatorExportTypes.ONNX_ATEN for + context. + For example:: + + graph(%0 : Float): + %3 : int = prim::Constant[value=0]() + # conversion unsupported + %4 : Float = aten::triu(%0, %3) + # conversion supported + %5 : Float = aten::mul(%4, %0) + return (%5) + + Assuming ``aten::triu`` is not supported in ONNX, this will be exported as:: + + graph(%0 : Float): + %1 : Long() = onnx::Constant[value={0}]() + # not converted + %2 : Float = aten::ATen[operator="triu"](%0, %1) + # converted + %3 : Float = onnx::Mul(%2, %0) + return (%3) + + .. warning:: + + Models exported this way are probably runnable only by Caffe2. + + opset_version (int, default 17): The version of the + `default (ai.onnx) opset `_ + to target. Must be >= 7 and <= 17. + do_constant_folding: Apply the constant-folding optimization. + Constant-folding will replace some of the ops that have all constant inputs + with pre-computed constant nodes. + dynamic_axes: + + By default the exported model will have the shapes of all input and output tensors + set to exactly match those given in ``args``. To specify axes of tensors as + dynamic (i.e. known only at run-time), set ``dynamic_axes`` to a dict with schema: + + * KEY (str): an input or output name. Each name must also be provided in ``input_names`` or + ``output_names``. + * VALUE (dict or list): If a dict, keys are axis indices and values are axis names. If a + list, each element is an axis index. + + For example:: + + class SumModule(torch.nn.Module): + def forward(self, x): + return torch.sum(x, dim=1) + + + torch.onnx.export( + SumModule(), + (torch.ones(2, 2),), + "onnx.pb", + input_names=["x"], + output_names=["sum"], + ) + + Produces:: + + input { + name: "x" + ... + shape { + dim { + dim_value: 2 # axis 0 + } + dim { + dim_value: 2 # axis 1 + ... + output { + name: "sum" + ... + shape { + dim { + dim_value: 2 # axis 0 + ... + + While:: + + torch.onnx.export( + SumModule(), + (torch.ones(2, 2),), + "onnx.pb", + input_names=["x"], + output_names=["sum"], + dynamic_axes={ + # dict value: manually named axes + "x": {0: "my_custom_axis_name"}, + # list value: automatic names + "sum": [0], + }, + ) + + Produces:: + + input { + name: "x" + ... + shape { + dim { + dim_param: "my_custom_axis_name" # axis 0 + } + dim { + dim_value: 2 # axis 1 + ... + output { + name: "sum" + ... + shape { + dim { + dim_param: "sum_dynamic_axes_1" # axis 0 + ... + + keep_initializers_as_inputs: If True, all the + initializers (typically corresponding to parameters) in the + exported graph will also be added as inputs to the graph. If False, + then initializers are not added as inputs to the graph, and only + the non-parameter inputs are added as inputs. + This may allow for better optimizations (e.g. constant folding) by + backends/runtimes. + + If True, `deduplicate_initializers` pass will not be executed. This means + initializers with duplicated values will not be deduplicated and + will be treated as distinct inputs to the graph. This allows different + input initializers to be supplied at the runtime following export. 
+ + If ``opset_version < 9``, initializers MUST be part of graph + inputs and this argument will be ignored and the behavior will be + equivalent to setting this argument to True. + + custom_opsets (dict[str, int], default empty dict): A dict with schema: + + * KEY (str): opset domain name + * VALUE (int): opset version + + If a custom opset is referenced by ``model`` but not mentioned in this dictionary, + the opset version is set to 1. Only custom opset domain name and version should be + indicated through this argument. + + export_modules_as_functions: Flag to enable + exporting all ``nn.Module`` forward calls as local functions in ONNX. Or a set to indicate the + particular types of modules to export as local functions in ONNX. + This feature requires ``opset_version`` >= 15, otherwise the export will fail. This is because + ``opset_version`` < 15 implies IR version < 8, which means no local function support. + Module variables will be exported as function attributes. There are two categories of function + attributes. + + 1. Annotated attributes: class variables that have type annotations via + `PEP 526-style `_ + will be exported as attributes. + Annotated attributes are not used inside the subgraph of ONNX local function because + they are not created by PyTorch JIT tracing, but they may be used by consumers + to determine whether or not to replace the function with a particular fused kernel. + + 2. Inferred attributes: variables that are used by operators inside the module. Attribute names + will have prefix "inferred::". This is to differentiate from predefined attributes retrieved from + python module annotations. Inferred attributes are used inside the subgraph of ONNX local function. + + * ``False`` (default): export ``nn.Module`` forward calls as fine grained nodes. + * ``True``: export all ``nn.Module`` forward calls as local function nodes. + * Set of type of nn.Module: export ``nn.Module`` forward calls as local function nodes, + only if the type of the ``nn.Module`` is found in the set. + + autograd_inlining: Flag used to control whether to inline autograd functions. + Refer to https://github.com/pytorch/pytorch/pull/74765 for more details. + + Raises: + :class:`torch.onnx.errors.CheckerError`: If the ONNX checker detects an invalid ONNX graph. + :class:`torch.onnx.errors.UnsupportedOperatorError`: If the ONNX graph cannot be exported because it + uses an operator that is not supported by the exporter. + :class:`torch.onnx.errors.OnnxExporterError`: Other errors that can occur during export. + All errors are subclasses of :class:`errors.OnnxExporterError`. + """ + if operator_export_type != _C_onnx.OperatorExportTypes.ONNX: + warnings.warn( + "Setting `operator_export_type` to something other than default is deprecated. " + "The option will be removed in a future release.", + category=FutureWarning, + ) + if training == _C_onnx.TrainingMode.TRAINING: + warnings.warn( + "Setting `training` to something other than default is deprecated. " + "The option will be removed in a future release. 
Please set the training mode " + "before exporting the model.", + category=FutureWarning, + ) + + args = (args,) if isinstance(args, torch.Tensor) else args + if kwargs is not None: + args = args + (kwargs,) + + _export( + model, + args, + f, + export_params, + verbose, + training, + input_names, + output_names, + operator_export_type=operator_export_type, + opset_version=opset_version, + do_constant_folding=do_constant_folding, + dynamic_axes=dynamic_axes, + keep_initializers_as_inputs=keep_initializers_as_inputs, + custom_opsets=custom_opsets, + export_modules_as_functions=export_modules_as_functions, + autograd_inlining=autograd_inlining, + ) + + return None + + +def _is_constant_tensor_list(node): + if node.kind() != "prim::Constant": + return False + output_type = node.output().type() + if output_type.isSubtypeOf(_C.ListType.ofTensors()): + return True + if output_type.isSubtypeOf(_C.ListType(_C.OptionalType.ofTensor())): + return True + + +# ONNX can't handle constants that are lists of tensors, which can +# get generated in constant prop. So we split them back into prim::ListConstructs + + +def _split_tensor_list_constants(g, block): + for node in block.nodes(): + for subblock in node.blocks(): + _split_tensor_list_constants(g, subblock) + if _is_constant_tensor_list(node): + inputs = [] + for val in node.output().toIValue(): + input = g.insertConstant(val) + input.node().moveBefore(node) + input.node().copyMetadata(node) + inputs.append(input) + + lc = ( + g.create("prim::ListConstruct", inputs) + .insertBefore(node) + .output() + .setType(_C.ListType.ofTensors()) + ) + lc.node().copyMetadata(node) + node.output().replaceAllUsesWith(lc) + + +def _optimize_graph( + graph: _C.Graph, + operator_export_type: _C_onnx.OperatorExportTypes, + _disable_torch_constant_prop: bool = False, + fixed_batch_size: bool = False, + params_dict=None, + dynamic_axes=None, + input_names=None, + module=None, +): + if params_dict is None: + params_dict = {} + + # Inline everything + _C._jit_pass_inline(graph) + + # Remove fork/wait nodes + _C._jit_pass_inline_fork_wait(graph) + _C._jit_pass_lint(graph) + if GLOBALS.autograd_inlining: + _C._jit_pass_onnx_autograd_function_process(graph) + _C._jit_pass_lower_all_tuples(graph) + + # we now record some ops like ones/zeros + # into a trace where we previously recorded constants. + # use constant prop to maintain our current level of onnx support + # without implementing symbolics for all of them + if _disable_torch_constant_prop is False: + _C._jit_pass_constant_propagation(graph) + + _split_tensor_list_constants(graph, graph) + # run dce to eliminate dead parts of the graph that might have been + # left behind by things like symbolic_override + _C._jit_pass_dce(graph) + _C._jit_pass_lint(graph) + + # CSE should improve perf when Autocast is used with disabled cache + # Autocast is disabled due to a limitation on tracer as described at https://github.com/pytorch/pytorch/issues/84092 + # Must run before _C._jit_pass_erase_number_types to prevent type substitution + if _C._jit_pass_cse(graph): + _C._jit_pass_onnx_lint(graph) + + _C._jit_pass_canonicalize_graph_fuser_ops(graph) + _C._jit_pass_lint(graph) + _C._jit_pass_peephole(graph, True) + _C._jit_pass_fuse_addmm(graph) + _C._jit_pass_lint(graph) + + _C._jit_pass_peephole(graph, True) + _C._jit_pass_lower_all_tuples(graph) + # in _jit_pass_onnx, symbolic functions are called for each node for conversion. + # However, there are nodes that cannot be converted without additional context. 
+ # For example, the number of outputs from split (and whether it is static or dynamic) is unknown + # until the point where it is unpacked by listUnpack node. + # This pass does a preprocess, and prepares the nodes such that enough context can be received + # by the symbolic function. + _C._jit_pass_onnx_remove_inplace_ops_for_onnx(graph, module) + _C._jit_pass_onnx_preprocess(graph) + + # onnx does not support tuples, so try to remove them + _C._jit_pass_lint(graph) + + # onnx only supports tensors, but 1 / 2 = 0.5 and tensor(1) / tensor(2) = 0 + _C._jit_pass_prepare_division_for_onnx(graph) + + _C._jit_pass_onnx_remove_print(graph) + _C._jit_pass_onnx_preprocess_caffe2(graph) + + symbolic_helper._quantized_ops.clear() + # Unpack quantized weights for conv and linear ops and insert into graph. + _C._jit_pass_onnx_unpack_quantized_weights(graph, params_dict) + # onnx only supports tensors, so we turn all out number types into tensors + _C._jit_pass_erase_number_types(graph) + if GLOBALS.onnx_shape_inference: + input_names = [] if input_names is None else input_names + dynamic_axes = {} if dynamic_axes is None else dynamic_axes + _C._jit_pass_onnx_set_dynamic_input_shape(graph, dynamic_axes, input_names) + _C._jit_pass_onnx_lint(graph) + + graph = _C._jit_pass_onnx(graph, operator_export_type) + _C._jit_pass_onnx_lint(graph) + _C._jit_pass_lint(graph) + + _C._jit_pass_onnx_scalar_type_analysis( + graph, True, GLOBALS.export_onnx_opset_version + ) + _C._jit_pass_lint(graph) + + _C._jit_pass_onnx_peephole( + graph, GLOBALS.export_onnx_opset_version, fixed_batch_size + ) + _C._jit_pass_lint(graph) + + # graph is not a valid jit graph anymore because types have been replaced + # (e.g. int with Tensor), so it now contains operators that don't actually + # exist. We can't run normal dead code elimination because it'd fail trying + # to look up if an operator has side effects, but we can run a dead code + # elimination variant that doesn't need to look up if an op has side effects. + _C._jit_pass_dce_allow_deleting_nodes_with_side_effects(graph) + _C._jit_pass_lint(graph) + graph = _C._jit_pass_canonicalize(graph) + _C._jit_pass_lint(graph) + if GLOBALS.onnx_shape_inference: + _C._jit_pass_onnx_graph_shape_type_inference( + graph, params_dict, GLOBALS.export_onnx_opset_version + ) + + return graph + + +def warn_on_static_input_change(input_states): + """Warns that changes to input dictionaries and strings won't take effect in the traced ONNX graph. + + We accept dictionaries and strings as ONNX inputs, but they should be only for + configuration use. we detect here if these inputs are modified, and if so we warn + the user that the changes won't take effect in the traced ONNX graph. + """ + for input, traced_input in zip(input_states[0], input_states[1]): + if isinstance(input, dict): + if list(input.keys()) != list(traced_input.keys()): + warning = ( + "We detected that you are modifying a dictionary that is an input to your " + "model. " + "Note that dictionaries are allowed as inputs in ONNX but they should be " + "handled with care. " + "Usages of dictionaries is not recommended, and should not be used except " + "for configuration use. " + "Also note that the order and values of the keys must remain the same. " + ) + warnings.warn(warning) + elif isinstance(input, str): + if input != traced_input: + warning = ( + "The model seems to have string inputs/outputs. " + "Note that strings will not appear as inputs/outputs of the ONNX graph. 
" + ) + warnings.warn(warning) + + +def _resolve_args_by_export_type(arg_name, arg_value, operator_export_type): + """Resolves the arguments that are ignored when export_type != operator_export_type.ONNX.""" + return arg_value + + +def _decide_keep_init_as_input( + keep_initializers_as_inputs: bool | None, + operator_export_type: _C_onnx.OperatorExportTypes, + opset_version: int, +): + """Decides whether the initializers in the graph should be listed as ONNX graph inputs. + + This method encapsulates the logic to decide whether the initializers in the graph + should be listed as ONNX graph inputs (i.e., whether to choose ONNX IR v3 or v4). + If keep_initializers_as_inputs is not specified (None), then we decide whether to keep + initializers as graph inputs (val_keep_init_as_ip) based on export type. If export type + is ONNX, then do not keep initializers as input (val_keep_init_as_ip=False). For all other + export types keep initializers as input (val_keep_init_as_ip=True). + If keep_initializers_as_inputs is specified, then respect it. Unless opset version <= 8, + in which case it must be ignored because for opset version <= 8, all initializers MUST be + part of graph input (only ONNX IR v3 is allowed), i.e. val_keep_init_as_ip=True. + + Special handling is needed for opset version 8 or lower, because irrespective + of user input for keep_initializers_as_inputs, the graph must follow ONNX IR v3 + semantics, i.e. all initializers must be listed as ONNX graph input. + """ + + if opset_version < 9: + if keep_initializers_as_inputs is False: + warnings.warn( + "Setting 'keep_initializers_as_inputs=False' for opset version" + "8 or lower would lead to an invalid ONNX graph. Therefore, " + "'keep_initializers_as_inputs=False' is ignored during export." + "Exported model will have initializers as graph inputs (compliant " + " to ONNX IR v3)." + ) + return True # i.e. True == initializers are part of graph input (ONNX IR v3) + val_keep_init_as_ip = ( + True if keep_initializers_as_inputs is None else keep_initializers_as_inputs + ) + if ( + keep_initializers_as_inputs is None + and operator_export_type is _C_onnx.OperatorExportTypes.ONNX + ): + val_keep_init_as_ip = False + return val_keep_init_as_ip + + +def _decide_add_node_names(add_node_names, operator_export_type): + return _resolve_args_by_export_type( + "add_node_names", add_node_names, operator_export_type + ) + + +def _decide_constant_folding(do_constant_folding, operator_export_type, training): + do_constant_folding = _resolve_args_by_export_type( + "do_constant_folding", do_constant_folding, operator_export_type + ) + if do_constant_folding and ( + training is not None and training is not _C_onnx.TrainingMode.EVAL + ): + warnings.warn( + "It is recommended that constant folding be turned off ('do_constant_folding=False') " + "when exporting the model in training-amenable mode, i.e. with 'training=TrainingMode.TRAIN' " + "or 'training=TrainingMode.PRESERVE' (when model is in training mode). Otherwise, some " + "learnable model parameters may not translate correctly in the exported ONNX model " + "because constant folding mutates model parameters. Please consider " + "turning off constant folding or setting the training=TrainingMode.EVAL." 
+ ) + return do_constant_folding + + +def _signature(model) -> inspect.Signature: + should_be_callable = getattr(model, "forward", model) + if callable(should_be_callable): + return inspect.signature(should_be_callable) + raise ValueError("model has no forward method and is not callable") + + +def _decide_input_format(model, args): + try: + sig = _signature(model) + except ValueError as e: + warnings.warn(f"{e}, skipping _decide_input_format") + return args + try: + ordered_list_keys = list(sig.parameters.keys()) + if ordered_list_keys[0] == "self": + ordered_list_keys = ordered_list_keys[1:] + args_dict: dict = {} + if isinstance(args, list): + args_list = args + elif isinstance(args, tuple): + args_list = list(args) + else: + args_list = [args] + if isinstance(args_list[-1], dict): + args_dict = args_list[-1] + args_list = args_list[:-1] + n_nonkeyword = len(args_list) + for optional_arg in ordered_list_keys[n_nonkeyword:]: + if optional_arg in args_dict: + args_list.append(args_dict[optional_arg]) + # Check if this arg has a default value + else: + param = sig.parameters[optional_arg] + if param.default != param.empty: + args_list.append(param.default) + args = args_list if isinstance(args, list) else tuple(args_list) + # Cases of models with no input args + except IndexError: + warnings.warn("No input args, skipping _decide_input_format") + except Exception as e: + warnings.warn(f"Skipping _decide_input_format\n {e.args[0]}") + return args + + +def _from_dynamic_axes_to_dynamic_shapes( + model, + dynamic_axes: Mapping[str, Mapping[int, str]] + | Mapping[str, Sequence[int]] + | None = None, + input_names: Sequence[str] | None = None, +) -> dict[str, Any] | None: + """ + + dynamic_axes examples: + (1) dynamic_axes = {"x": {0: "my_custom_axis_name_1"}, "y": {1: "my_custom_axis_name_2"}} + (2) dynamic_axes = {"x": [0], "y": [1]} + + these will be converted to dynamic_shapes respectively: + (1) dynamic_shapes = {"x": {0: Dim("my_custom_axis_name_1")}, "y": {1: Dim("my_custom_axis_name_2")}} + (2) dynamic_shapes = {"x": {0: Dim("x_dim_0")}, "y": {1: Dim("y_dim_1")}} # auto-generated dim names + + """ + if dynamic_axes is None: + return None + + if input_names is None: + input_names_set = set() + else: + input_names_set = set(input_names) + + dynamic_shapes: dict[str, Any | None] = {} + for input_name, axes in dynamic_axes.items(): + if input_name in input_names_set: + raise ValueError( + "Assinging new input names is not supported yet. Please use model forward signature " + "to specify input names in dynamix_axes." 
+ ) + if isinstance(axes, dict): + dynamic_shapes[input_name] = { + k: torch.export.Dim(v) for k, v in axes.items() + } + elif isinstance(axes, list): + dynamic_shapes[input_name] = { + k: torch.export.Dim(f"{input_name}_dim_{k}") for k in axes + } + else: + raise TypeError( + f"dynamic_axes value must be either a dict or a list, but got {type(axes)}" + ) + # torch.export.export needs static dim to present in dynamic_shapes + # for all input tensors, so we need to add them with None + try: + sig = _signature(model) + except ValueError as e: + warnings.warn(f"{e}, skipping auto filling None on static axes...") + return dynamic_shapes + for input_name in sig.parameters.keys(): + if input_name not in dynamic_shapes: + dynamic_shapes[input_name] = None + return dynamic_shapes + + +def _trace(func, args, operator_export_type, return_outs=False): + # Special case for common case of passing a single Tensor + if isinstance(args, torch.Tensor): + args = (args,) + + trace_graph, torch_out, inputs_states = torch.jit._get_trace_graph( + func, + args, + strict=False, + _force_outplace=False, + _return_inputs_states=True, + ) + warn_on_static_input_change(inputs_states) + + trace_graph = _optimize_graph(trace_graph, operator_export_type, params_dict={}) + if return_outs: + return trace_graph, torch_out + return trace_graph + + +def _trace_and_get_graph_from_model(model, args): + # A basic sanity check: make sure the state_dict keys are the same + # before and after running the model. Fail fast! + orig_state_dict_keys = torch.jit._unique_state_dict(model).keys() + + # Disable Autocast cache because it replaces kernel's weight and bias + # by (undesired) constants. + # No perf impact for when there are reused weights since https://github.com/pytorch/pytorch/pull/85665 + prev_autocast_cache_enabled = torch.is_autocast_cache_enabled() + torch.set_autocast_cache_enabled(False) + trace_graph, torch_out, inputs_states = torch.jit._get_trace_graph( + model, + args, + strict=False, + _force_outplace=False, + _return_inputs_states=True, + ) + torch.set_autocast_cache_enabled(prev_autocast_cache_enabled) + + warn_on_static_input_change(inputs_states) + + if orig_state_dict_keys != torch.jit._unique_state_dict(model).keys(): + raise RuntimeError( + "state_dict changed after running the tracer; " + "something weird is happening in your model!" + ) + + return trace_graph, torch_out + + +def _get_param_count_list(method_graph, args_params): + param_count_list = [] + for input_, arg_params_ in zip(method_graph.inputs(), args_params): + if "PackedParams" in str(input_.type()): + in_vars, _ = torch.jit._flatten(arg_params_) + param_count_list.append(len(in_vars)) + else: + param_count_list.append(arg_params_ is not None) + + return param_count_list + + +def _check_flatten_did_not_remove(original, jit_flattened): + """torch.jit._flatten removes None. Check if it did so in this case.""" + + def flatten(x): + if isinstance(x, (list, tuple)): + for inner in x: + yield from flatten(inner) + elif isinstance(x, dict): + for inner in x.values(): + yield from flatten(inner) + else: + yield x + + flattened_with_none = list(flatten(original)) + num_none = len(flattened_with_none) - len(jit_flattened) + assert num_none >= 0 + if num_none: + raise ValueError( + f"args contained {num_none} None's after flattening. " + "When exporting a ScriptModule or ScriptFunction, no args may " + "be None because that breaks type propagation." 
+ ) + + +def _create_jit_graph( + model: torch.nn.Module | torch.jit.ScriptFunction, args: Sequence[Any] +) -> tuple[_C.Graph, list[_C.IValue], Any | None, _C.ScriptModule | None]: + if isinstance(model, (torch.jit.ScriptFunction, torch.jit.ScriptModule)): + flattened_args = tuple(torch.jit._flatten(tuple(args))[0]) + _check_flatten_did_not_remove(args, flattened_args) + torch_out = None + + if isinstance(model, torch.jit.ScriptModule): + try: + graph = model.forward.graph # type: ignore[attr-defined] + except AttributeError as e: + raise RuntimeError("'forward' method must be a script method") from e + _C._jit_pass_onnx_function_substitution(graph) + freezed_module = _C._freeze_module( + cast(_C.ScriptModule, model._c), preserveParameters=True + ) + module, params = _C._jit_onnx_list_model_parameters(freezed_module) + method_graph = module._get_method("forward").graph + args_params = tuple(args) + tuple(params) + param_count_list = _get_param_count_list(method_graph, args_params) + in_vars, _ = torch.jit._flatten(args_params) + graph = _C._propagate_and_assign_input_shapes( + method_graph, tuple(in_vars), param_count_list, False, False + ) + return graph, params, torch_out, module + + # torch.jit.ScriptFunction + params = [] + graph = model.graph + _C._jit_pass_onnx_function_substitution(graph) + param_count_list = _get_param_count_list(graph, args) + graph = _C._propagate_and_assign_input_shapes( + graph, flattened_args, param_count_list, False, False + ) + return graph, params, torch_out, None + + graph, torch_out = _trace_and_get_graph_from_model(model, args) + _C._jit_pass_onnx_lint(graph) + state_dict = torch.jit._unique_state_dict(model) + params = list(state_dict.values()) + graph_inputs = list(graph.inputs()) + user_input_num = len(graph_inputs) - len(state_dict) + param_names = list(state_dict.keys()) + for i, inp in enumerate(graph_inputs): + if i >= user_input_num: + inp.setDebugName(param_names[i - user_input_num]) + _C._jit_pass_onnx_function_substitution(graph) + return graph, params, torch_out, None + + +def _get_named_param_dict(graph, params): + input_and_param_names = [val.debugName() for val in graph.inputs()] + param_names = input_and_param_names[len(input_and_param_names) - len(params) :] + _params_dict = dict(zip(param_names, params)) + return _params_dict + + +def _get_example_outputs(model, args): + input_args = copy.deepcopy(args) + input_kwargs = {} + if input_args and isinstance(input_args[-1], dict): + input_kwargs = input_args[-1] + input_args = input_args[:-1] + + example_outputs = model(*input_args, **input_kwargs) + if isinstance(example_outputs, list): + example_outputs = [example_outputs] + elif not isinstance(example_outputs, tuple): + example_outputs = (example_outputs,) + + return example_outputs + + +_qtype_vtype_map = { + torch.quint8: torch.uint8, + torch.qint8: torch.int8, + torch.qint32: torch.int32, + torch.quint4x2: torch.int8, +} + + +def unpack_quantized_tensor(value, cast_onnx_accepted=True): + if isinstance(value, torch.Tensor) and value.dtype in _qtype_vtype_map: + q_value_dequantize = value.dequantize() + q_scale = ( + torch.tensor(value.q_scale(), dtype=torch.double) + if cast_onnx_accepted + else torch.tensor(value.q_scale(), dtype=torch.float32) + ) + q_zero_point = ( + torch.tensor(value.q_zero_point(), dtype=torch.int64) + if cast_onnx_accepted + else torch.tensor(value.q_zero_point(), dtype=_qtype_vtype_map[value.dtype]) + ) + q_value = q_value_dequantize / q_scale + q_zero_point + q_value = 
q_value.to(dtype=_qtype_vtype_map[value.dtype]) + return q_value, q_scale, q_zero_point + else: + return (value,) + + +def _pre_trace_quant_model(model, args): + r"""Returns `torch.jit.trace(model, args)` if model is quantized. Otherwise do nothing and return + original model. + + This is due to https://github.com/pytorch/pytorch/issues/75761. + """ + if any( + hasattr(m, "_packed_params") for m in getattr(model, "modules", list)() + ) or any(getattr(arg, "is_quantized", False) for arg in args): + return torch.jit.trace(model, args) + return model + + +def _model_to_graph( + model, + args, + verbose=False, + input_names=None, + output_names=None, + operator_export_type=_C_onnx.OperatorExportTypes.ONNX, + do_constant_folding=True, + _disable_torch_constant_prop=False, + fixed_batch_size=False, + training=_C_onnx.TrainingMode.EVAL, + dynamic_axes=None, +) -> tuple[ + _C.Graph, + dict[str, torch.Tensor], + torch.Tensor + | tuple[torch.Tensor, ...] + | list[torch.Tensor] + | dict[str, torch.Tensor] + | Any + | None, +]: + """Converts model into an ONNX graph. + + Returns: + graph: A TorchScript IR Graph with ONNX nodes. + params_dict: Dict from input param name to param value. + torch_out: The output tensors resulting from the trace of ``model``. + If ``model`` is a :class:`torch.jit.ScriptModule` or :class:`torch.jit.ScriptFunction`, + this will be None, since we are not doing any tracing. + """ + # TODO: can we simplify this to always return a tuple of Tensor or None? + + # Special case for common case of passing a single Tensor + if isinstance(args, (torch.Tensor, int, float, bool)): + args = (args,) + + model = _pre_trace_quant_model(model, args) + graph, params, torch_out, module = _create_jit_graph(model, args) + params_dict = _get_named_param_dict(graph, params) + + try: + graph = _optimize_graph( + graph, + operator_export_type, + _disable_torch_constant_prop=_disable_torch_constant_prop, + fixed_batch_size=fixed_batch_size, + params_dict=params_dict, + dynamic_axes=dynamic_axes, + input_names=input_names, + module=module, + ) + except Exception as e: + torch.onnx.log("Torch IR graph at exception: ", graph) + raise + + is_script = isinstance(model, (torch.jit.ScriptFunction, torch.jit.ScriptModule)) + if is_script: + example_outputs = _get_example_outputs(model, args) + example_outputs_final = () + for example_output in example_outputs: + example_outputs_final += unpack_quantized_tensor(example_output) + out_vars, desc = torch.jit._flatten(example_outputs_final) + _C._jit_pass_onnx_assign_output_shape( + graph, + out_vars, + desc, + GLOBALS.onnx_shape_inference, + is_script, + GLOBALS.export_onnx_opset_version, + ) + + # NB: ONNX requires complete information about output types, which might be + # erased by some optimizations, so we need to set it explicitly again. + else: + if not isinstance(torch_out, (list, tuple)): + output_wrapped = [torch_out] + else: + output_wrapped = torch_out # type: ignore[assignment] + + output_tensors, out_desc = torch.jit._flatten(tuple(output_wrapped)) + # assign_output_shape pass is not compatible with quantized outputs. + # Quantized outputs are flattened to 3 values in ONNX, while packed as + # single value in PyTorch. 
+ if not any(getattr(out, "is_quantized", False) for out in output_tensors): + _C._jit_pass_onnx_assign_output_shape( + graph, + output_tensors, + out_desc, + GLOBALS.onnx_shape_inference, + is_script, + GLOBALS.export_onnx_opset_version, + ) + + _set_input_and_output_names(graph, input_names, output_names) + params_dict = _get_named_param_dict(graph, params) + + if ( + do_constant_folding + and GLOBALS.export_onnx_opset_version + >= _constants.ONNX_CONSTANT_FOLDING_MIN_OPSET + ): + if training is None or training == _C_onnx.TrainingMode.EVAL: + params_dict = _C._jit_pass_onnx_eval_peephole(graph, params_dict) + + params_dict = _C._jit_pass_onnx_constant_fold( + graph, params_dict, GLOBALS.export_onnx_opset_version + ) + _C._jit_pass_dce_allow_deleting_nodes_with_side_effects(graph) + + if GLOBALS.onnx_shape_inference: + _C._jit_pass_onnx_graph_shape_type_inference( + graph, params_dict, GLOBALS.export_onnx_opset_version + ) + + params_dict = _C._jit_pass_onnx_eliminate_unused_items(graph, params_dict) + + # For ONNX opset < 9, constants only have three data types: float16, float, double. + # In this pass, transform constants of other data types to float/double + cast operator. + if GLOBALS.export_onnx_opset_version < 9: + _C._jit_pass_onnx_cast_all_constant_to_floating(graph) + + params_dict = _C._jit_pass_filter_non_tensor_arguments(params_dict) + _C._jit_decay_packed_param_input_types(graph) + + # If outputs lack a proper name and are identified only by their unique id, + # give them a legible name for debugging purposes. + _apply_friendly_debug_names(graph, params_dict) + + return graph, params_dict, torch_out + + +@torch._disable_dynamo +@_deprecation.deprecated("2.5", "the future", "use onnx.printer.to_text() instead") +def export_to_pretty_string( + model, + args, + export_params=True, + verbose=False, + training=_C_onnx.TrainingMode.EVAL, + input_names=None, + output_names=None, + operator_export_type=_C_onnx.OperatorExportTypes.ONNX, + export_type=None, + google_printer=False, + opset_version=None, + keep_initializers_as_inputs=None, + custom_opsets=None, + add_node_names=True, + do_constant_folding=True, + dynamic_axes=None, +): + """Similar to :func:`export`, but returns a text representation of the ONNX model. + + Only the args that differ are listed below. All other args are the same + as :func:`export`. + + Args: + add_node_names (bool, default True): Whether or not to set + NodeProto.name. This makes no difference unless + ``google_printer=True``. + google_printer (bool, default False): If False, will return a custom, + compact representation of the model. If True, will return the + protobuf's `Message::DebugString()`, which is more verbose. + + Returns: + A UTF-8 str containing a human-readable representation of the ONNX model.
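+ + Example (a minimal sketch; the module and inputs are illustrative placeholders):: + + import torch + + model = torch.nn.Linear(2, 2) + text = torch.onnx.export_to_pretty_string(model, (torch.randn(1, 2),)) + print(text)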
+ """ + if opset_version is None: + opset_version = _constants.ONNX_DEFAULT_OPSET + if custom_opsets is None: + custom_opsets = {} + GLOBALS.export_onnx_opset_version = opset_version + GLOBALS.operator_export_type = operator_export_type + + with exporter_context(model, training, verbose): + val_keep_init_as_ip = _decide_keep_init_as_input( + keep_initializers_as_inputs, operator_export_type, opset_version + ) + val_add_node_names = _decide_add_node_names( + add_node_names, operator_export_type + ) + val_do_constant_folding = _decide_constant_folding( + do_constant_folding, operator_export_type, training + ) + args = _decide_input_format(model, args) + graph, params_dict, torch_out = _model_to_graph( + model, + args, + verbose, + input_names, + output_names, + operator_export_type, + val_do_constant_folding, + training=training, + dynamic_axes=dynamic_axes, + ) + + return graph._pretty_print_onnx( # type: ignore[attr-defined] + params_dict, + opset_version, + False, + operator_export_type, + google_printer, + val_keep_init_as_ip, + custom_opsets, + val_add_node_names, + ) + + +@_deprecation.deprecated("2.5", "the future", "avoid using this function") +def unconvertible_ops( + model, + args, + training: _C_onnx.TrainingMode = _C_onnx.TrainingMode.EVAL, + opset_version: int | None = None, +) -> tuple[_C.Graph, list[str]]: + """Returns an approximated list of all ops that are yet supported by :mod:`torch.onnx`. + + The list is approximated because some ops may be removed during the conversion + process and don't need to be converted. Some other ops may have partial support + that will fail conversion with particular inputs. Please open a Github Issue + for op support requests. + + Args: + model: Same as the `model` parameter in :func:`torch.onnx.export`. + args: Same as the `args` parameter in :func:`torch.onnx.export`. + training: Same as the `training` parameter in :func:`torch.onnx.export`. + opset_version: Same as the `opset_version` parameter in :func:`torch.onnx.export`. + + Returns: + The JIT graph and a list of unconvertible ops in the format of "domain::op". + """ + + opset_version = opset_version or _constants.ONNX_DEFAULT_OPSET + GLOBALS.export_onnx_opset_version = opset_version + + try: + with exporter_context(model, training, verbose=False): + # Create a mostly clean JIT graph that contains the plain aten and + # other ops we can check with the symbolic registry. + # NOTE: We don't want to actually convert any ops to ONNX or run any + # symbolic functions because there is a higher chance that a pass + # fails or an unconvertible op messes up the graph during ONNX conversion. + # This way we can always generate a list just by looking at the names + # of the ops in the graph. + args = _decide_input_format(model, args) + model = _pre_trace_quant_model(model, args) + graph, _, _, module = _create_jit_graph(model, args) + _C._jit_pass_inline(graph) + _C._jit_pass_onnx_remove_inplace_ops_for_onnx(graph, module) + _C._jit_pass_erase_number_types(graph) + _C._jit_pass_dce_allow_deleting_nodes_with_side_effects(graph) + except Exception as e: + raise errors.OnnxExporterError( + "Failed to discover unconvertible ops because of errors during the JIT graph " + "generation process." 
+ ) from e + + unsupported_ops = [] + for node in graph.nodes(): + domain_op = node.kind() + if domain_op.startswith(("onnx::", "prim::")): + # We consider onnx and prim ops as supported ops, even though some "prim" + # ops are not implemented as symbolic functions, because they may be + # eliminated in the conversion passes. Users may still see errors caused + # by prim ops even though they don't show up in the list. + continue + if not registration.registry.is_registered_op( + domain_op.rstrip("_"), opset_version + ): + # We consider all registered ops supported, even though some of them are + # only partially supported, because there is not yet a good way to check + # if an op is fully supported. + # TODO(justinchuby): Create a way to check if an op is fully supported. + unsupported_ops.append(domain_op) + return graph, unsupported_ops + + +def _setup_trace_module_map( + model: torch.nn.Module | torch.jit.ScriptModule, + export_modules_as_functions: bool | Collection[type[torch.nn.Module]], +) -> set[str]: + def __register_attribute_hook(): + attr_name = "_onnx_attrs" + + def _track_module_attributes_forward_pre_hook(module, input): + setattr(module, attr_name, _get_module_attributes(module)) + + def _track_module_attributes_forward_hook(module, input, output): + tracing_state = _C._get_tracing_state() + if not tracing_state: + return + + graph = tracing_state.graph() + onnx_attrs = {} + if hasattr(module, attr_name): + onnx_attrs = getattr(module, attr_name) + delattr(module, attr_name) + + _C._jit_pass_onnx_track_scope_attributes(graph, onnx_attrs) + + for m in model.modules(): + m.register_forward_hook(_track_module_attributes_forward_hook) + m.register_forward_pre_hook(_track_module_attributes_forward_pre_hook) + + def _unqualified_variable_name(qualified_name: str) -> str: + """ + Parse qualified variable name and return the unqualified version. + + Pure numeric atoms are considered inadequate, so this function will look past them + and start from the first non-numeric atom. + + Example: + >>> _unqualified_variable_name("__main__.Foo.bar") + 'bar' + >>> _unqualified_variable_name("__main__.Foo.bar.0") + 'bar.0' + """ + name_atoms = qualified_name.split(".") + for i, atom in reversed(list(enumerate(name_atoms))): + if not atom.isnumeric(): + return ".".join(name_atoms[i:]) + return qualified_name + + trace_module_map = { + _m: torch._C._jit_onnx_create_full_scope_name( + torch.typename(type(_m)), _unqualified_variable_name(_n) + ) + for _n, _m in model.named_modules() + } + torch.jit._trace._trace_module_map = trace_module_map + if isinstance(export_modules_as_functions, bool) and export_modules_as_functions: + module_typenames = {torch.typename(type(module)) for module in trace_module_map} + elif isinstance(export_modules_as_functions, set) and export_modules_as_functions: + + def _find_typename(v): + if isinstance(v, type): + return torch.typename(v) + else: + raise RuntimeError( + "Only types of `nn.Module` should be " + "passed in the set for argument `export_modules_as_functions`. " + f"Got `{type(v).__name__}`."
+ ) + + module_typenames = {_find_typename(v) for v in export_modules_as_functions} + else: + module_typenames = set() + + if module_typenames: + __register_attribute_hook() + + return module_typenames + + +def _reset_trace_module_map(): + torch.jit._trace._trace_module_map = None + _C._jit_pass_onnx_clear_scope_records() + + +def _get_module_attributes(module): + annotations = typing.get_type_hints(type(module)) + base_m_annotations = typing.get_type_hints(torch.nn.Module) + for k in base_m_annotations: + annotations.pop(k, None) + # Check whether module attributes can be accessed. Some classes + # define attributes but don't provide access to them in their + # constructor. + # + # For example, torch.nn.Embedding has the `freeze` variable and its + # type specified in the class but the attribute is not created in the + # constructor. In other words, there is no `self.freeze = ` + # in the constructor. + # + # Reference: https://github.com/pytorch/pytorch/blob/92de1d322223fb5584e384971b32c46b93bc2f4b/torch/nn/modules/sparse.py#L120 + attrs = {} + for k in annotations: + try: + attrs[k] = getattr(module, k) + except AttributeError: + torch.onnx.log(f"Skipping module attribute '{k}'") + continue + return attrs + + +def _export( + model, + args, + f, + export_params=True, + verbose=False, + training=_C_onnx.TrainingMode.EVAL, + input_names=None, + output_names=None, + operator_export_type=_C_onnx.OperatorExportTypes.ONNX, + export_type=None, + opset_version=None, + do_constant_folding=True, + dynamic_axes=None, + keep_initializers_as_inputs=None, + fixed_batch_size=False, + custom_opsets=None, + add_node_names=True, + onnx_shape_inference=True, + export_modules_as_functions: Any = False, + autograd_inlining=True, +): + assert GLOBALS.in_onnx_export is False + + if export_type is None: + export_type = _exporter_states.ExportTypes.PROTOBUF_FILE + + if isinstance(model, torch.nn.DataParallel): + raise ValueError( + "torch.nn.DataParallel is not supported by the ONNX " + "exporter. Please unwrap the model via its 'module' " + "attribute. Try " + "torch.onnx.export(model.module, ...)" + ) + + GLOBALS.onnx_shape_inference = onnx_shape_inference + + if opset_version is None: + opset_version = _constants.ONNX_DEFAULT_OPSET + + # torch.onnx.export does not support opset versions above + # _constants.ONNX_TORCHSCRIPT_EXPORTER_MAX_OPSET + if opset_version > _constants.ONNX_TORCHSCRIPT_EXPORTER_MAX_OPSET: + # We do not want to fail because we should still allow users to create + # custom symbolic functions for newer opsets + warnings.warn( + f"Exporting to ONNX opset version {opset_version} is not supported " + f"by 'torch.onnx.export()'. " + f"The highest opset version supported is {_constants.ONNX_TORCHSCRIPT_EXPORTER_MAX_OPSET}. " + f"To use a newer opset version, consider 'torch.onnx.export(..., dynamo=True)'. ", + category=errors.OnnxExporterWarning, + ) + + if export_modules_as_functions and opset_version < 15: + raise ValueError( + "`export_modules_as_functions` is not supported for `opset_version` < 15. " + "This is because `opset_version` < 15 implies IR version < 8, which means " + "no local function support. " + ) + if not operator_export_type: + operator_export_type = _C_onnx.OperatorExportTypes.ONNX + + # By default, training=TrainingMode.EVAL, + # which is good because running a model in training mode could result in + # internal buffers getting updated, dropout getting applied, etc.
+ # If you really know what you're doing, you can set + # training=TrainingMode.TRAINING or training=TrainingMode.PRESERVE + # (to preserve whatever the original training mode was). + GLOBALS.export_onnx_opset_version = opset_version + GLOBALS.operator_export_type = operator_export_type + + try: + GLOBALS.in_onnx_export = True + _autograd_inlining_previous = GLOBALS.autograd_inlining + GLOBALS.autograd_inlining = autograd_inlining + + module_typenames_to_export_as_functions: set[str] = set() + if isinstance(model, (torch.nn.Module, torch.jit.ScriptModule)): + module_typenames_to_export_as_functions = _setup_trace_module_map( + model, export_modules_as_functions + ) + + with exporter_context(model, training, verbose): + val_keep_init_as_ip = _decide_keep_init_as_input( + keep_initializers_as_inputs, + operator_export_type, + opset_version, + ) + val_add_node_names = _decide_add_node_names( + add_node_names, operator_export_type + ) + val_do_constant_folding = _decide_constant_folding( + do_constant_folding, operator_export_type, training + ) + # Normally f can be a file-like object, but for large models, the external data format requires a + # valid `model_file_location`. Code in export.cpp will enforce this. + if isinstance(f, str): + model_file_location = f + else: + model_file_location = "" + args = _decide_input_format(model, args) + if dynamic_axes is None: + dynamic_axes = {} + _validate_dynamic_axes(dynamic_axes, model, input_names, output_names) + + graph, params_dict, torch_out = _model_to_graph( + model, + args, + verbose, + input_names, + output_names, + operator_export_type, + val_do_constant_folding, + fixed_batch_size=fixed_batch_size, + training=training, + dynamic_axes=dynamic_axes, + ) + + # TODO: Don't allocate an in-memory string for the protobuf + defer_weight_export = ( + export_type is not _exporter_states.ExportTypes.PROTOBUF_FILE + ) + if custom_opsets is None: + custom_opsets = {} + + _C._jit_pass_dce_allow_deleting_nodes_with_side_effects(graph) + node_attr_to_name = {} # type: ignore[var-annotated] + if module_typenames_to_export_as_functions: + # NOTE: cannot call DCE after this pass. DCE will remove function definition nodes. + node_attr_to_name = _C._jit_pass_onnx_function_extraction( + graph, + module_typenames_to_export_as_functions, + list(params_dict.keys()), + ) + + if keep_initializers_as_inputs is not True: + params_dict = _C._jit_pass_onnx_deduplicate_initializers( # type: ignore[assignment] + graph, + params_dict, # type: ignore[arg-type] + getattr(model, "training", False), # type: ignore[arg-type] + ) + _C._jit_pass_onnx_assign_scoped_names_for_node_and_value(graph) + if export_params: + ( + proto, + export_map, + val_use_external_data_format, + node_names, + ) = graph._export_onnx( # type: ignore[attr-defined] + params_dict, + opset_version, + dynamic_axes, + defer_weight_export, + operator_export_type, + not verbose, + val_keep_init_as_ip, + custom_opsets, + val_add_node_names, + model_file_location, + node_attr_to_name, + ) + else: + ( + proto, + export_map, + val_use_external_data_format, + node_names, + ) = graph._export_onnx( # type: ignore[attr-defined] + {}, + opset_version, + dynamic_axes, + False, + operator_export_type, + not verbose, + val_keep_init_as_ip, + custom_opsets, + val_add_node_names, + model_file_location, + node_attr_to_name, + ) + # insert function_proto into model_proto.
+ proto = onnx_proto_utils._add_onnxscript_fn( + proto, + custom_opsets, + ) + if verbose: + torch.onnx.log("Exported graph: ", graph) + onnx_proto_utils._export_file(proto, f, export_type, export_map) + finally: + assert GLOBALS.in_onnx_export + GLOBALS.in_onnx_export = False + GLOBALS.autograd_inlining = _autograd_inlining_previous + _reset_trace_module_map() + + return torch_out + + +def _apply_friendly_debug_names(graph, params): + for n in graph.nodes(): + for v in n.inputs(): + old_name = v.debugName() + if old_name != str(v.unique()): + continue + new_name = f"{n.kind()}_{v.unique()}" + v.setDebugName(new_name) + if old_name in params: + params[new_name] = params.pop(old_name) + + +def _set_input_and_output_names(graph, input_names, output_names): + def set_names(node_list, name_list, descriptor): + if name_list is None: + return + if len(name_list) > len(node_list): + raise RuntimeError( + "number of %s names provided (%d) exceeded number of %ss (%d)" + % (descriptor, len(name_list), descriptor, len(node_list)) + ) + + # Mark if the output node DebugName is set before. + output_node_set = set() + for i, (name, node) in enumerate(zip(name_list, node_list)): + # Duplicated output node, insert onnx::Identity to avoid setting the same DebugName after setDebugName(). + if descriptor == "output": + if node in output_node_set: + identity_node = graph.create("onnx::Identity") + identity_node.insertAfter(node.node()) + identity_node.addInput(node) + identity_node.output().setType(node.type()) + graph.return_node().replaceInput(i, identity_node.output()) + node = identity_node.output() + output_node_set.add(node) + + if node.debugName() != name: + node.setDebugName(name) + + set_names(list(graph.inputs()), input_names, "input") + set_names(list(graph.outputs()), output_names, "output") + + +def _run_symbolic_method(g, op_name, symbolic_fn, args): + r""" + This trampoline function gets invoked for every symbolic method + call from C++. + """ + try: + graph_context = jit_utils.GraphContext( + graph=g, + block=g.block(), + opset=GLOBALS.export_onnx_opset_version, + original_node=None, # type: ignore[arg-type] + params_dict=_params_dict, + env={}, + values_in_env=set(), + new_nodes=[], + ) + return symbolic_fn(graph_context, *args) + except TypeError as e: + # Handle the specific case where we didn't successfully dispatch + # to symbolic_fn. Otherwise, the backtrace will have the clues + # you need. 
+ e.args = (f"{e.args[0]} (occurred when translating {op_name})",) + raise + + +def _add_block(node: _C.Node) -> _C.Block: + return node.addBlock() + + +def _add_input_to_block(block: _C.Block): + return block.addInputToBlock() # type: ignore[attr-defined] + + +def _add_output_to_block(block: _C.Block, value: _C.Value) -> int: + return block.registerOutput(value) + + +def _should_aten_fallback( + name: str, opset_version: int, operator_export_type: _C_onnx.OperatorExportTypes +): + # For all builds, if domain=="aten" and operator_export_type==ONNX_ATEN, + # an aten::ATen operator is created regardless of symbolics existence + + is_exportable_aten_op = registration.registry.is_registered_op(name, opset_version) + is_onnx_aten_export = operator_export_type == _C_onnx.OperatorExportTypes.ONNX_ATEN + is_aten_fallback_export = ( + operator_export_type == _C_onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK + ) + + if not name.startswith("aten::"): + return False + + if is_onnx_aten_export or (is_aten_fallback_export and not is_exportable_aten_op): + return True + + return False + + +def _get_aten_op_overload_name(n: _C.Node) -> str: + # Returns `overload_name` attribute to ATen ops on non-Caffe2 builds + schema = n.schema() + if not schema.startswith("aten::"): + return "" + return _C.parse_schema(schema).overload_name + + +def _run_symbolic_function( + graph: _C.Graph, + block: _C.Block, + node: _C.Node, + inputs: Any, + env: dict[_C.Value, _C.Value], + values_in_env: set[_C.Value], + new_nodes: list[_C.Node], + operator_export_type=_C_onnx.OperatorExportTypes.ONNX, +) -> _C.Value | Sequence[_C.Value | None] | None: + """Runs a symbolic function. + + The function is used in C++ to export the node to ONNX. + + Returns: + A single or a tuple of Values. + None when the node gets cloned as is into the new graph. + """ + + opset_version = GLOBALS.export_onnx_opset_version + + # See Note [Export inplace] + node_kind = node.kind() + if node_kind.endswith("_"): + # Treat relu_ -> relu; add_ -> add etc. + ns_op_name = node_kind[:-1] + else: + ns_op_name = node_kind + + namespace, op_name = jit_utils.parse_node_kind(ns_op_name) + + graph_context = jit_utils.GraphContext( + graph=graph, + block=block, + opset=opset_version, + original_node=node, + params_dict=_params_dict, + env=env, + values_in_env=values_in_env, + new_nodes=new_nodes, + ) + + # Direct ATen export requested + if _should_aten_fallback(ns_op_name, opset_version, operator_export_type): + attrs = { + k + "_" + node.kindOf(k)[0]: symbolic_helper._node_get(node, k) + for k in node.attributeNames() + } + outputs = node.outputsSize() + attrs["outputs"] = outputs + return graph_context.aten_op( + op_name, + *inputs, + overload_name=_get_aten_op_overload_name(node), + **attrs, + ) + + try: + domain = namespace + symbolic_function_name = f"{domain}::{op_name}" + + symbolic_function_group = registration.registry.get_function_group( + symbolic_function_name + ) + if symbolic_function_group is not None: + symbolic_fn = symbolic_function_group.get(opset_version) + if symbolic_fn is not None: + # TODO Wrap almost identical attrs assignment or comment the difference. 
+ attrs = { + k: symbolic_helper._node_get(node, k) for k in node.attributeNames() + } + return symbolic_fn(graph_context, *inputs, **attrs) + + attrs = { + k + "_" + node.kindOf(k)[0]: symbolic_helper._node_get(node, k) + for k in node.attributeNames() + } + if namespace == "onnx": + # Clone node to trigger ONNX shape inference + return graph_context.op( + op_name, *inputs, **attrs, outputs=node.outputsSize() + ) # type: ignore[attr-defined] + + raise errors.UnsupportedOperatorError( + symbolic_function_name, + opset_version, + symbolic_function_group.get_min_supported() + if symbolic_function_group + else None, + ) + + except RuntimeError: + if operator_export_type == _C_onnx.OperatorExportTypes.ONNX_FALLTHROUGH: + return None + elif operator_export_type == _C_onnx.OperatorExportTypes.ONNX_ATEN_FALLBACK: + # Emit ATen op for non-Caffe2 builds when `operator_export_type==ONNX_ATEN_FALLBACK` + attrs = { + k + "_" + node.kindOf(k)[0]: symbolic_helper._node_get(node, k) + for k in node.attributeNames() + } + return graph_context.aten_op( + op_name, + *inputs, + overload_name=_get_aten_op_overload_name(node), + **attrs, + ) + raise + except TypeError as e: + # Handle the specific case where we didn't successfully dispatch. + # Otherwise, the backtrace will have the clues you need. + e.args = (f"{e.args[0]} \n(Occurred when translating {op_name}).",) + raise + + +def _verify_custom_op_name(symbolic_name: str): + if not re.match(r"^[a-zA-Z0-9-_]+::[a-zA-Z-_]+[a-zA-Z0-9-_]*$", symbolic_name): + raise errors.OnnxExporterError( + f"Failed to register operator {symbolic_name}. " + "The symbolic name must match the format domain::name, " + "and should start with a letter and contain only " + "alphanumerical characters" + ) + + ns, _ = jit_utils.parse_node_kind(symbolic_name) + if ns == "onnx": + raise ValueError( + f"Failed to register operator {symbolic_name}. {ns} domain cannot be modified." + ) + + +def register_custom_op_symbolic( + symbolic_name: str, + symbolic_fn: Callable, + opset_version: int, +): + """Registers a symbolic function for a custom operator. + + When the user registers symbolic for custom/contrib ops, + it is highly recommended to add shape inference for that operator via setType API, + otherwise the exported graph may have incorrect shape inference in some extreme cases. + An example of setType is `test_aten_embedding_2` in `test_operators.py`. + + See "Custom Operators" in the module documentation for an example usage. + + Args: + symbolic_name (str): The name of the custom operator in "::" + format. + symbolic_fn (Callable): A function that takes in the ONNX graph and + the input arguments to the current operator, and returns new + operator nodes to add to the graph. + opset_version (int): The ONNX opset version in which to register. + """ + if symbolic_name.startswith("::"): + symbolic_name = f"aten{symbolic_name}" + + _verify_custom_op_name(symbolic_name) + + registration.custom_onnx_symbolic(symbolic_name, opset_version)(symbolic_fn) + + +def unregister_custom_op_symbolic(symbolic_name: str, opset_version: int): + """Unregisters ``symbolic_name``. + + See "Custom Operators" in the module documentation for an example usage. + + Args: + symbolic_name (str): The name of the custom operator in "::" + format. + opset_version (int): The ONNX opset version in which to unregister. 
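+ + Example (a minimal sketch; ``my_relu`` and the ``custom_ops`` domain are illustrative placeholders):: + + import torch + + def my_relu(g, input): + return g.op("Relu", input) + + torch.onnx.register_custom_op_symbolic("custom_ops::relu", my_relu, 9) + # ... export models that use custom_ops::relu ... + torch.onnx.unregister_custom_op_symbolic("custom_ops::relu", 9)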
+ """ + if symbolic_name.startswith("::"): + symbolic_name = f"aten{symbolic_name}" + + _verify_custom_op_name(symbolic_name) + + registration.registry.unregister(symbolic_name, opset_version) + + +def _validate_dynamic_axes(dynamic_axes, model, input_names, output_names): + """Ensures dynamic axes argument is follows the expected format.""" + if len(dynamic_axes) == 0: + return + + if hasattr(model, "graph"): + # Extracting set of valid input/output names that shall be used for dynamic_axes + if (input_names is None) or len(input_names) == 0: + input_names = [x.debugName() for x in model.graph.inputs()] + if (output_names is None) or len(output_names) == 0: + output_names = [y.debugName() for y in model.graph.outputs()] + + valid_names = set((input_names or []) + (output_names or [])) + + # If dynamic axes are provided as a list rather than dictionary, they should + # first get converted to a dictionary in expected format. If desired axes names + # are not provided for dynamic axes, automatic names shall be generated for + # provided dynamic axes of specified input/output + for key, value in dynamic_axes.items(): + if key not in valid_names: + warnings.warn( + f"Provided key {key} for dynamic axes is not a valid input/output name" + ) + if isinstance(value, list): + warnings.warn( + "No names were found for specified dynamic axes of provided input." + f"Automatically generated names will be applied to each dynamic axes of input {key}" + ) + + value_dict = {} + for i, x in enumerate(value): + if not isinstance(x, int): + raise ValueError( + "The type of axis index is expected to be an integer" + ) + if x in value_dict: + warnings.warn( + f"Duplicate dynamic axis index {x} was provided for input {key}." + ) + else: + value_dict[x] = str(key) + "_dynamic_axes_" + str(i + 1) + dynamic_axes[key] = value_dict + + +def model_signature(model: torch.nn.Module | Callable) -> inspect.Signature: + return inspect.signature( + model.forward if isinstance(model, torch.nn.Module) else model + ) diff --git a/.venv/Lib/site-packages/torch/onnx/verification.py b/.venv/Lib/site-packages/torch/onnx/verification.py new file mode 100644 index 0000000000000000000000000000000000000000..d4950d72850f9a21215bbd9a0aaee482942c025c --- /dev/null +++ b/.venv/Lib/site-packages/torch/onnx/verification.py @@ -0,0 +1,1806 @@ +# mypy: allow-untyped-defs +"""Functions to verify exported ONNX model is functionally equivalent to original PyTorch model. + +ONNX Runtime is required, and is used as the ONNX backend for export verification. 
+""" + +from __future__ import annotations + +import contextlib +import copy +import dataclasses +import datetime +import difflib +import enum +import functools +import io +import itertools +import os +import tempfile +import warnings +from typing import Any, Callable, Collection, Mapping, Sequence, Tuple, Union + +import numpy as np + +import torch +import torch._C._onnx as _C_onnx +from torch import _C +from torch.onnx import _constants, _experimental, _exporter_states, utils +from torch.onnx._globals import GLOBALS +from torch.onnx._internal import onnx_proto_utils +from torch.types import Number + + +_ORT_PROVIDERS = ("CPUExecutionProvider",) + +_NumericType = Union[Number, torch.Tensor, np.ndarray] +_ModelType = Union[torch.nn.Module, torch.jit.ScriptModule] +_InputArgsType = Union[torch.Tensor, Tuple[Any, ...]] +_InputKwargsType = Mapping[str, Any] +_OutputsType = Union[Sequence[_NumericType], Sequence] + + +class OnnxBackend(enum.Enum): + """Enum class for ONNX backend used for export verification.""" + + REFERENCE = "ONNXReferenceEvaluator" + ONNX_RUNTIME_CPU = "CPUExecutionProvider" + ONNX_RUNTIME_CUDA = "CUDAExecutionProvider" + + +@dataclasses.dataclass +class VerificationOptions: + """Options for ONNX export verification. + + Attributes: + flatten: If True, unpack nested list/tuple/dict inputs into a flattened list of + Tensors for ONNX. Set this to False if nested structures are to be preserved + for ONNX, which is usually the case with exporting ScriptModules. Default True. + ignore_none: Whether to ignore None type in torch output, which is usually the + case with tracing. Set this to False, if torch output should keep None type, + which is usually the case with exporting ScriptModules. Default to True. + check_shape: Whether to check the shapes between PyTorch and ONNX Runtime outputs + are exactly the same. Set this to False to allow output shape broadcasting. + Default to True. + check_dtype: Whether to check the dtypes between PyTorch and ONNX Runtime outputs + are consistent. Default to True. + backend: ONNX backend for verification. Default to OnnxBackend.ONNX_RUNTIME_CPU. + rtol: relative tolerance in comparison between ONNX and PyTorch outputs. + atol: absolute tolerance in comparison between ONNX and PyTorch outputs. + remained_onnx_input_idx: If provided, only the specified inputs will be passed + to the ONNX model. Supply a list when there are unused inputs in the model. + Since unused inputs will be removed in the exported ONNX model, supplying + all inputs will cause an error on unexpected inputs. This parameter tells + the verifier which inputs to pass into the ONNX model. + acceptable_error_percentage: acceptable percentage of element mismatches in comparison. + It should be a float of value between 0.0 and 1.0. 
+ """ + + flatten: bool = True + ignore_none: bool = True + check_shape: bool = True + check_dtype: bool = True + backend: OnnxBackend = OnnxBackend.ONNX_RUNTIME_CPU + rtol: float = 1e-3 + atol: float = 1e-7 + remained_onnx_input_idx: Sequence[int] | None = None + acceptable_error_percentage: float | None = None + + +def _flatten_tuples(elem): + flattened = [] + for t in elem: + if isinstance(t, tuple): + flattened.extend(_flatten_tuples(t)) + else: + flattened.append(t) + return flattened + + +# TODO(justinchuby): Add type checking by narrowing down the return type when input is None +def _to_numpy(elem) -> list | np.ndarray: + if isinstance(elem, torch.Tensor): + if elem.requires_grad: + return elem.detach().cpu().numpy() + else: + return elem.cpu().numpy() + elif isinstance(elem, (list, tuple)): + return [_to_numpy(inp) for inp in elem] + elif isinstance(elem, (bool, int, float)): + return np.array(elem) + elif isinstance(elem, dict): + flattened = [] + for k in elem: + flattened.extend([_to_numpy(k), _to_numpy(elem[k])]) + return flattened + return elem + + +def _inline_flatten_list(inputs, res_list) -> list: + for i in inputs: + res_list.append(i) if not isinstance( + i, (list, tuple) + ) else _inline_flatten_list(i, res_list) + return res_list + + +def _unpack_to_numpy(values, cast_onnx_accepted=True) -> list: + value_unpacked = [] + for value in values: + value_unpacked.extend( + utils.unpack_quantized_tensor(value, cast_onnx_accepted=cast_onnx_accepted) + ) + return [_to_numpy(v) for v in value_unpacked] + + +def _run_onnx(onnx_session, inputs) -> _OutputsType: + kw_inputs = {} + if inputs and isinstance(inputs[-1], dict): + kw_inputs = inputs[-1] + inputs = inputs[:-1] + inputs = _unpack_to_numpy(_flatten_tuples(inputs)) + ort_inputs = {} + for input_name, input in kw_inputs.items(): + ort_inputs[input_name] = _to_numpy(input) + inputs = _to_numpy(inputs) + if hasattr(onnx_session, "get_inputs"): + # onnxruntime.InferenceSession + input_names = [i.name for i in onnx_session.get_inputs()] + elif hasattr(onnx_session, "input_names"): + # onnx.reference.ReferenceEvaluator + input_names = onnx_session.input_names + else: + raise ValueError(f"Unknown ONNX backend type: {type(onnx_session)}.") + + for i, input in enumerate(inputs): + if i == len(input_names) or input_names[i] in ort_inputs: + raise ValueError( + f"got too many positional inputs. inputs: {inputs}. kw_inputs: {kw_inputs}. " + f"input names: {input_names}." + ) + ort_inputs[input_names[i]] = input + onnx_outs = onnx_session.run(None, ort_inputs) + return onnx_outs + + +def _ort_session( + model: str | io.BytesIO, ort_providers: Sequence[str] = _ORT_PROVIDERS +): + try: + import onnxruntime # type: ignore[import] + except ImportError as e: + raise ImportError("onnxruntime is required for export verification.") from e + + if ort_providers is None: + ort_providers = _ORT_PROVIDERS + + session_options = onnxruntime.SessionOptions() + # suppress ort warnings. + # 0:Verbose, 1:Info, 2:Warning. 3:Error, 4:Fatal. Default is 2. 
+ session_options.log_severity_level = 3 + ort_session = onnxruntime.InferenceSession( + model if isinstance(model, str) else model.getvalue(), + session_options, + providers=ort_providers, + ) + return ort_session + + +def _onnx_reference_evaluator_session(model: str | io.BytesIO): + try: + import onnx + from onnx import reference as onnx_reference # type: ignore[attr-defined] + except ImportError as exc: + raise ImportError("onnx >= 1.13 is required for reference evaluator.") from exc + + proto = ( + onnx.load(model) # type: ignore[attr-defined] + if isinstance(model, str) + else onnx.load_model_from_string(model.getvalue()) # type: ignore[attr-defined] + ) + onnx_session = onnx_reference.ReferenceEvaluator(proto) + return onnx_session + + +def _onnx_backend_session(model: str | io.BytesIO, backend: OnnxBackend): + if backend == OnnxBackend.REFERENCE: + onnx_session = _onnx_reference_evaluator_session(model) + elif backend in {OnnxBackend.ONNX_RUNTIME_CPU, OnnxBackend.ONNX_RUNTIME_CUDA}: + onnx_session = _ort_session(model, (backend.value,)) + else: + raise ValueError(f"Unsupported backend: {backend}") + return onnx_session + + +def _compare_onnx_pytorch_outputs_in_np( + onnx_outs: _OutputsType, + pt_outs: _OutputsType, + options: VerificationOptions, +): + assert ( + len(onnx_outs) == len(pt_outs) + ), f"Number of outputs differ ONNX runtime: ({len(onnx_outs)}) PyTorch: ({len(pt_outs)})" + acceptable_error_percentage = options.acceptable_error_percentage + if acceptable_error_percentage and ( + acceptable_error_percentage > 1.0 or acceptable_error_percentage < 0.0 + ): + raise ValueError( + "If set, acceptable_error_percentage should be between 0.0 and 1.0" + ) + + for ort_out, pt_out in zip(onnx_outs, pt_outs): + try: + # TODO: Remove `check_shape` option once every shape inconsistent issue is addressed. + if not options.check_shape: + # Allow different but broadcastable output shapes. + ort_out, pt_out = np.broadcast_arrays(ort_out, pt_out) + torch.testing.assert_close( + ort_out, + pt_out, + rtol=options.rtol, + atol=options.atol, + check_dtype=options.check_dtype, + equal_nan=True, + ) + except AssertionError as e: + if acceptable_error_percentage: + error_percentage = 1 - np.sum( + np.isclose(ort_out, pt_out, rtol=options.rtol, atol=options.atol) + ) / np.prod(ort_out.shape) + if error_percentage <= acceptable_error_percentage: + warnings.warn( + f"Suppressed AssertionError:\n{e}.\n" + f"Error percentage {error_percentage} " + f"within acceptable range {acceptable_error_percentage}." + ) + continue + if ort_out.dtype == np.uint8 or ort_out.dtype == np.int8: + warnings.warn("ONNX output is quantized") + if pt_out.dtype == np.uint8 or pt_out.dtype == np.int8: + warnings.warn("PyTorch output is quantized") + raise + + +def _compare_onnx_pytorch_outputs( + onnx_outs: _OutputsType, + pt_outs: Any, + options: VerificationOptions, +): + """ + Compare ONNX and PyTorch outputs. + + Args: + onnx_outs: outputs from ONNX backend. + pt_outs: outputs from PyTorch. + options: options for verification. + + Raises: + AssertionError: if outputs from ONNX model and PyTorch model are not + equal up to specified precision. + ValueError: if arguments provided are invalid. 
+ """ + if options.ignore_none: + # torch.jit._flatten filters None type + pt_outs, _ = torch.jit._flatten(pt_outs) + else: + pt_outs = _inline_flatten_list([pt_outs], []) + pt_outs_np = _unpack_to_numpy(pt_outs, cast_onnx_accepted=False) + onnx_outs = _inline_flatten_list(onnx_outs, []) + _compare_onnx_pytorch_outputs_in_np(onnx_outs, pt_outs_np, options) + + +def _prepare_input_for_pytorch(args, kwargs): + """Prepare input for PyTorch model execution. + + Any future changes/formatting to the input before dispatching to the PyTorch + model should be made in this function. + + Args: + args: positional arguments for PyTorch model forward method. + kwargs: keyword arguments for PyTorch model forward method. + + Returns: + args: positional arguments for PyTorch model forward method. + kwargs: keyword arguments for PyTorch model forward method. + """ + if isinstance(args, (torch.Tensor, dict)): + args = (args,) + # In-place operators will update input tensor data as well. + # Thus inputs are replicated before every forward call. + args = copy.deepcopy(args) + if kwargs: + kwargs = copy.deepcopy(kwargs) + else: + kwargs = {} + return args, kwargs + + +def _prepare_input_for_export(args, kwargs): + """Prepare input for ONNX model export. + + Any future changes/formatting to the input before dispatching to the + :func:`torch.onnx.export` api should be made in this function. + + Args: + args: positional arguments for PyTorch model forward method. + kwargs: keyword arguments for PyTorch model forward method. + + Returns: + onnx_inputs: positional arguments for ONNX model export, as `args` in + :func:`torch.onnx.export`. + """ + args, kwargs = _prepare_input_for_pytorch(args, kwargs) + if not kwargs and len(args) > 0 and isinstance(args[-1], dict): + onnx_inputs = args + ({},) + elif kwargs: + onnx_inputs = args + (kwargs,) + else: + onnx_inputs = args + return onnx_inputs + + +def _prepare_input_for_onnx( + args, kwargs, remained_onnx_input_idx: Sequence[int] | None, flatten: bool +): + """Prepare input for ONNX model execution in ONNX backend. + + Any future changes/formatting to the input before dispatching to the ONNX backend + run should be made in this function. + + Args: + args: positional arguments for PyTorch model forward method. + kwargs: keyword arguments for PyTorch model forward method. + remained_onnx_input_idx: indices of inputs to be used for ONNX model execution. + flatten: whether to flatten the input before dispatching to the ONNX model execution. + + Returns: + onnx_inputs: positional arguments for ONNX model execution in ONNX backend. + """ + onnx_inputs = _prepare_input_for_export(args, kwargs) + if flatten: + onnx_inputs, _ = torch.jit._flatten(onnx_inputs) + elif onnx_inputs and onnx_inputs[-1] == {}: + # Handle empty kwargs (normally removed by flatten). + onnx_inputs = onnx_inputs[:-1] + if remained_onnx_input_idx is not None: + return [onnx_inputs[i] for i in remained_onnx_input_idx] + else: + return onnx_inputs + + +def _try_clone_model(model): + """Used for preserving original model in case forward mutates model states.""" + try: + return copy.deepcopy(model) + except Exception: + warnings.warn( + "Failed to clone model. Model state might be mutated during verification." 
+ ) + return model + + +def _compare_onnx_pytorch_model( + pt_model: _ModelType, + onnx_model_f: str | io.BytesIO, + input_args: _InputArgsType, + input_kwargs: _InputKwargsType | None, + additional_test_inputs: Sequence[_InputArgsType] | None, + options: VerificationOptions, +): + """Compare outputs from ONNX model runs with outputs from PyTorch model runs. + + Args: + pt_model: PyTorch model. + onnx_model_f: ONNX model file path or file-like object. + input_args: positional arguments for PyTorch model forward method. + input_kwargs: keyword arguments for PyTorch model forward method. + additional_test_inputs: additional positional arguments for PyTorch model + forward method. + options: options for verification. + + Raises: + AssertionError: if outputs from ONNX model and PyTorch model are not + equal up to specified precision. + """ + onnx_session = _onnx_backend_session(onnx_model_f, options.backend) + + def compare_onnx_pytorch_model_with_input(input_args, input_kwargs): + pt_args, pt_kwargs = _prepare_input_for_pytorch(input_args, input_kwargs) + # TODO: remove this and treat mutating model separately. See #77679 + pt_model_copy = _try_clone_model(pt_model) + pt_outs = pt_model_copy(*pt_args, **pt_kwargs) + + onnx_inputs = _prepare_input_for_onnx( + input_args, input_kwargs, options.remained_onnx_input_idx, options.flatten + ) + + onnx_outs = _run_onnx(onnx_session, onnx_inputs) + + _compare_onnx_pytorch_outputs( + onnx_outs=onnx_outs, + pt_outs=pt_outs, + options=options, + ) + + compare_onnx_pytorch_model_with_input(input_args, input_kwargs) + + if additional_test_inputs: + for test_input_args in additional_test_inputs: + compare_onnx_pytorch_model_with_input(test_input_args, {}) + + +class _GraphDiff: + """A class to represent the difference between two graphs.""" + + def __init__(self, graph_a: _C.Graph, graph_b: _C.Graph): + """Construct a _GraphDiff object. + + Args: + graph_a (_C.Graph): First graph to compare. + graph_b (_C.Graph): Second graph to compare. + """ + self.graph_a = graph_a + self.graph_b = graph_b + + def __str__(self): + """See function :func:`diff_report`.""" + return self.diff_report() + + def _indent(self, lines: str) -> str: + return "\n".join(["\t" + line for line in lines.splitlines()]) + + def diff_report(self) -> str: + """Return a string representation of the graph difference. + + The report shows the first pair of nodes that diverges. It also shows the source + location of the pair of nodes. + + Returns: + graph_diff_report (str): A string representation of the graph difference. 
+ """ + graph_a = self.graph_a + graph_b = self.graph_b + + graph_a_str = str(graph_a) + graph_b_str = str(graph_b) + + if graph_a_str == graph_b_str: + return "" + + graph_diff = difflib.ndiff( + graph_a_str.splitlines(True), graph_b_str.splitlines(True) + ) + graph_diff_report = ["Graph diff:", self._indent("".join(graph_diff))] + + for node_a, node_b in itertools.zip_longest(graph_a.nodes(), graph_b.nodes()): + if str(node_a) != str(node_b): + graph_diff_report.append("First diverging operator:") + node_diff = difflib.ndiff( + str(node_a).splitlines(True), str(node_b).splitlines(True) + ) + source_printout = ["node diff:", self._indent("".join(node_diff))] + + stack_a = node_a.sourceRange() if node_a else None + if stack_a: + source_printout.extend( + ["Former source location:", self._indent(str(stack_a))] + ) + stack_b = node_b.sourceRange() if node_b else None + if stack_b: + source_printout.extend( + ["Latter source location:", self._indent(str(stack_b))] + ) + + graph_diff_report.extend(source_printout) + + break + + return "\n".join(graph_diff_report) + + +def _check_graph_diff( + model: torch.nn.Module | torch.jit.ScriptModule, + test_input_groups: Sequence[tuple[tuple[Any, ...], Mapping[str, Any]]], + export_options: _experimental.ExportOptions, + model_to_graph_func: Callable[ + [ + torch.nn.Module, + tuple[Any, ...], + Mapping[str, Any], + _experimental.ExportOptions, + ], + _C.Graph, + ], +) -> str: + """Check if graph produced by `model_to_graph_func` is the same across `test_input_groups`. + + Args: + model: See :func:`check_export_model_diff`. + test_input_groups: See :func:`check_export_model_diff`. + export_options: See :func:`check_export_model_diff`. + model_to_graph_func: A function to convert a PyTorch model to a JIT IR graph. + + Returns: + graph_diff_report (str): A string representation of the graph difference. + """ + if len(test_input_groups) < 2: + raise ValueError("Need at least two groups of test inputs to compare.") + + ref_jit_graph = None + for args, kwargs in test_input_groups: + jit_graph = model_to_graph_func(model, args, kwargs, export_options) + if ref_jit_graph is None: + ref_jit_graph = jit_graph + continue + + graph_diff_report = _GraphDiff(ref_jit_graph, jit_graph).diff_report() + if graph_diff_report: + return graph_diff_report + return "" + + +def _traced_graph_from_model( + model: torch.nn.Module | torch.jit.ScriptModule, + args: tuple[Any, ...], + kwargs: Mapping[str, Any], + export_options: _experimental.ExportOptions, +) -> _C.Graph: + """As part of the ONNX export steps, create a traced JIT graph from a PyTorch model. + + Args: + model: See :func:`check_export_model_diff`. + args: See :func:`check_export_model_diff`. + kwargs: See :func:`check_export_model_diff`. + export_options: See :func:`check_export_model_diff`. + + Returns: + jit_graph (_C.Graph): A traced JIT graph. + """ + training = export_options.training + verbose = export_options.verbose + + with utils.exporter_context(model, training, verbose): + export_inputs = _prepare_input_for_export(args, kwargs) + model = utils._pre_trace_quant_model(model, export_inputs) + jit_graph, _, _, _ = utils._create_jit_graph(model, export_inputs) + return jit_graph + + +def _onnx_graph_from_model( + model: torch.nn.Module | torch.jit.ScriptModule, + args: tuple[Any, ...], + kwargs: Mapping[str, Any], + export_options: _experimental.ExportOptions, +) -> _C.Graph: + """As part of the ONNX export steps, export an ONNX JIT graph from a PyTorch model. 
+ + Args: + model: See :func:`check_export_model_diff`. + args: See :func:`check_export_model_diff`. + kwargs: See :func:`check_export_model_diff`. + export_options: See :func:`check_export_model_diff`. + + Returns: + onnx_graph (_C.Graph): An ONNX JIT graph. + """ + # TODO: refactor utils.py to remove duplicated code of context setup. See #78834 + opset_version = export_options.opset_version + operator_export_type = export_options.operator_export_type + export_modules_as_functions = export_options.export_modules_as_functions + training = export_options.training + verbose = export_options.verbose + dynamic_axes = export_options.dynamic_axes + input_names = export_options.input_names + output_names = export_options.output_names + + if opset_version is None: + opset_version = _constants.ONNX_DEFAULT_OPSET + + utils._setup_trace_module_map(model, export_modules_as_functions) + + if not operator_export_type: + operator_export_type = _C_onnx.OperatorExportTypes.ONNX + + GLOBALS.export_onnx_opset_version = opset_version + GLOBALS.operator_export_type = operator_export_type + + with utils.exporter_context(model, training, verbose): + do_constant_folding = utils._decide_constant_folding( + export_options.do_constant_folding, operator_export_type, training + ) + + if dynamic_axes is None: + dynamic_axes = {} + utils._validate_dynamic_axes(dynamic_axes, model, input_names, output_names) + + export_inputs = _prepare_input_for_export(args, kwargs) + export_inputs = utils._decide_input_format(model, export_inputs) + onnx_graph, _, _ = utils._model_to_graph( + model, + export_inputs, + verbose, + input_names, + output_names, + operator_export_type, + do_constant_folding, + training=training, + dynamic_axes=dynamic_axes, + ) + + return onnx_graph + + +def _onnx_graph_from_aten_graph( + graph: torch.Graph, + export_options: _experimental.ExportOptions, + params_dict: dict[str, Any] | None = None, +) -> tuple[torch.Graph, dict[str, Any]]: + if params_dict is None: + params_dict = {} + operator_export_type = export_options.operator_export_type + dynamic_axes = export_options.dynamic_axes or {} + input_names = export_options.input_names + training = export_options.training + do_constant_folding = export_options.do_constant_folding + opset_version = export_options.opset_version or _constants.ONNX_DEFAULT_OPSET + + GLOBALS.export_onnx_opset_version = opset_version + GLOBALS.operator_export_type = operator_export_type + + do_constant_folding = utils._decide_constant_folding( + do_constant_folding, operator_export_type, training + ) + + # TODO: Below is doing aten graph to onnx. It should be abstracted as a + # function in torch/onnx/utils.py. + graph = graph.copy() + graph = utils._optimize_graph( + graph, + operator_export_type, + params_dict=params_dict, + dynamic_axes=dynamic_axes, + input_names=input_names, + ) + + if training is None or training == _C_onnx.TrainingMode.EVAL: + params_dict = torch._C._jit_pass_onnx_eval_peephole(graph, params_dict) + + if ( + do_constant_folding + and opset_version >= _constants.ONNX_CONSTANT_FOLDING_MIN_OPSET + ): + params_dict = _C._jit_pass_onnx_constant_fold(graph, params_dict, opset_version) + _C._jit_pass_dce_allow_deleting_nodes_with_side_effects(graph) + + if GLOBALS.onnx_shape_inference: + _C._jit_pass_onnx_graph_shape_type_inference(graph, params_dict, opset_version) + + params_dict = _C._jit_pass_onnx_eliminate_unused_items(graph, params_dict) + + # For ONNX opset < 9, constants only have three data types: float16, float, double. 
+ # In this pass, transform constants of other data types to float/double + cast operator. + if opset_version < 9: + _C._jit_pass_onnx_cast_all_constant_to_floating(graph) + + params_dict = _C._jit_pass_filter_non_tensor_arguments(params_dict) + _C._jit_decay_packed_param_input_types(graph) + + _C._jit_pass_dce_allow_deleting_nodes_with_side_effects(graph) + + if export_options.verbose: + print("ONNX graph: ", graph) + + return graph, params_dict + + +def _onnx_proto_from_onnx_graph( + onnx_graph: torch.Graph, + export_options: _experimental.ExportOptions, + params_dict: dict[str, Any], +) -> tuple[bytes, Mapping[str, bytes]]: + opset_version = export_options.opset_version or _constants.ONNX_DEFAULT_OPSET + dynamic_axes = export_options.dynamic_axes or {} + operator_export_type = export_options.operator_export_type + val_keep_init_as_ip = utils._decide_keep_init_as_input( + export_options.keep_initializers_as_inputs, + operator_export_type, + opset_version, + ) + val_add_node_names = utils._decide_add_node_names(True, operator_export_type) + custom_opsets = export_options.custom_opsets or {} + + proto, export_map, _, _ = onnx_graph._export_onnx( # type: ignore[attr-defined] + params_dict, + opset_version, + dynamic_axes, + False, + operator_export_type, + not export_options.verbose, + val_keep_init_as_ip, + custom_opsets, + val_add_node_names, + "", + {}, + ) + + return proto, export_map + + +def check_export_model_diff( + model: torch.nn.Module | torch.jit.ScriptModule, + test_input_groups: Sequence[tuple[tuple[Any, ...], Mapping[str, Any]]], + export_options: _experimental.ExportOptions | None = None, +) -> str: + """Verify exported model discrepancy between different groups of inputs. + + A graph is exported for each group of inputs. The exported graphs are then compared + to each other, and discrepancies in the first diverging pair of nodes are reported. + This function first checks the jit graph. If no discrepancies are found, it then + checks the onnx graph. + + Unless otherwise specified, the jit/ONNX graph is expected to be the same, regardless + of the inputs used for exporting. A discrepancy implies the exported graph is + not accurate when run on other groups of inputs, which will typically result in + runtime errors or mismatching outputs. + + Args: + model (torch.nn.Module or torch.jit.ScriptModule): The model to be exported. + test_input_groups (Sequence[Tuple[Tuple[Any, ...], Mapping[str, Any]]]): A sequence + of input groups to be used to export the model. Each input group is a pair of + (args, kwargs). + export_options (_experimental.ExportOptions, optional): An _experimental.ExportOptions + object that controls the export behavior. + + Returns: + str: A string containing the diff of the exported models.
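+ + Example (a minimal sketch; the model and the two input groups are illustrative placeholders):: + + import torch + from torch.onnx import verification + + model = torch.nn.Linear(2, 2) + test_input_groups = [ + ((torch.randn(1, 2),), {}), + ((torch.randn(4, 2),), {}), + ] + diff = verification.check_export_model_diff(model, test_input_groups) + assert diff == "" # an empty report means no discrepancy was found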
+ """ + export_options = ( + _experimental.ExportOptions() if export_options is None else export_options + ) + + jit_diff_report = _check_graph_diff( + model, test_input_groups, export_options, _traced_graph_from_model + ) + if jit_diff_report: + return jit_diff_report + + return _check_graph_diff( + model, test_input_groups, export_options, _onnx_graph_from_model + ) + + +def verify( + model: _ModelType, + input_args: _InputArgsType, + input_kwargs: _InputKwargsType | None = None, + do_constant_folding: bool = True, + dynamic_axes: Mapping[str, Mapping[int, str] | Mapping[str, Sequence[int]]] + | None = None, + input_names: Sequence[str] | None = None, + output_names: Sequence[str] | None = None, + training: _C_onnx.TrainingMode = _C_onnx.TrainingMode.EVAL, + opset_version: int | None = None, + keep_initializers_as_inputs: bool = True, + verbose: bool = False, + fixed_batch_size: bool = False, + use_external_data: bool = False, + additional_test_inputs: Sequence[_InputArgsType] | None = None, + options: VerificationOptions | None = None, +): + """Verify model export to ONNX against original PyTorch model. + + Args: + model (torch.nn.Module or torch.jit.ScriptModule): See :func:`torch.onnx.export`. + input_args (tuple): See :func:`torch.onnx.export`. + input_kwargs (dict): See :func:`torch.onnx.export`. + do_constant_folding (bool, optional): See :func:`torch.onnx.export`. + dynamic_axes (dict, optional): See :func:`torch.onnx.export`. + input_names (list, optional): See :func:`torch.onnx.export`. + output_names (list, optional): See :func:`torch.onnx.export`. + training (torch.onnx.TrainingMode): See :func:`torch.onnx.export`. + opset_version (int, optional): See :func:`torch.onnx.export`. + keep_initializers_as_inputs (bool, optional): See :func:`torch.onnx.export`. + verbose (bool, optional): See :func:`torch.onnx.export`. + fixed_batch_size (bool, optional): Legacy argument, used only by rnn test cases. + use_external_data (bool, optional): Explicitly specify whether to export the + model with external data. + additional_test_inputs (list, optional): List of tuples. Each tuple is a group of + input arguments to test. Currently only *args are supported. + options (_VerificationOptions, optional): A _VerificationOptions object that + controls the verification behavior. + + Raises: + AssertionError: if outputs from ONNX model and PyTorch model are not + equal up to specified precision. + ValueError: if arguments provided are invalid. + """ + if options is None: + options = VerificationOptions() + + if training == torch.onnx.TrainingMode.TRAINING: + model.train() + elif training == torch.onnx.TrainingMode.EVAL: + model.eval() + with torch.no_grad(), contextlib.ExitStack() as stack: + model_f: str | io.BytesIO = io.BytesIO() + if use_external_data: + tmpdir_path = stack.enter_context(tempfile.TemporaryDirectory()) + model_f = os.path.join(tmpdir_path, "model.onnx") + + inputs_for_export = _prepare_input_for_export(input_args, input_kwargs) + + # TODO(#77679): remove this and treat mutating model separately. 
+ model_copy = _try_clone_model(model) + utils._export( + model, + inputs_for_export, + model_f, + opset_version=opset_version, + do_constant_folding=do_constant_folding, + keep_initializers_as_inputs=keep_initializers_as_inputs, + dynamic_axes=dynamic_axes, + input_names=input_names, + output_names=output_names, + fixed_batch_size=fixed_batch_size, + training=training, + verbose=verbose, + ) + + _compare_onnx_pytorch_model( + pt_model=model_copy, + onnx_model_f=model_f, + input_args=input_args, + input_kwargs=input_kwargs, + additional_test_inputs=additional_test_inputs, + options=options, + ) + + +def verify_aten_graph( + graph: torch.Graph, + input_args: tuple[Any, ...], + export_options: _experimental.ExportOptions, + params_dict: dict[str, Any] | None = None, + verification_options: VerificationOptions | None = None, +) -> tuple[AssertionError | None, torch.Graph, _OutputsType, _OutputsType]: + if verification_options is None: + verification_options = VerificationOptions() + if params_dict is None: + params_dict = {} + + original_jit_graph = graph + graph = graph.copy() + + # Execute aten graph and get reference torch jit outputs. + graph_inputs = list(graph.inputs()) + jit_inputs = tuple([arg for arg in input_args if arg is not None]) + weights = [params_dict[v.debugName()] for v in graph_inputs[len(jit_inputs) :]] + assert all(w is not None for w in weights) + # TODO: Only copy the argument if mutation is detected in Graph. + jit_inputs = copy.deepcopy(jit_inputs) + jit_input_and_parameters = jit_inputs + tuple(weights) + jit_outs = torch._C._jit_interpret_graph(graph, jit_input_and_parameters) # type: ignore[attr-defined] + if not isinstance(jit_outs, (list, tuple)): + jit_outs = [jit_outs] + + # Convert aten graph to onnx graph. + graph, onnx_params_dict = _onnx_graph_from_aten_graph( + graph, export_options, params_dict + ) + + proto, export_map = _onnx_proto_from_onnx_graph( + graph, export_options, onnx_params_dict + ) + model_f: str | io.BytesIO = io.BytesIO() + export_type = _exporter_states.ExportTypes.PROTOBUF_FILE + onnx_proto_utils._export_file(proto, model_f, export_type, export_map) + + # NOTE: Verification is unstable. Try catch to emit information for debugging. + try: + # NOTE: Input might be dce'ed, so we need to remove those from the input args. 
+ new_input_names = {v.debugName() for v in graph.inputs()} + new_input_args = [] + for v, arg in zip(original_jit_graph.inputs(), input_args): + if v.debugName() in new_input_names: + new_input_args.append(arg) + input_args = tuple(new_input_args) + + onnx_inputs = _prepare_input_for_onnx( + input_args, + {}, + verification_options.remained_onnx_input_idx, + verification_options.flatten, + ) + + onnx_session = _onnx_backend_session(model_f, verification_options.backend) + onnx_outs = _run_onnx(onnx_session, onnx_inputs) + del onnx_session # To free device memory + + try: + _compare_onnx_pytorch_outputs( + onnx_outs=onnx_outs, + pt_outs=jit_outs, + options=verification_options, + ) + except AssertionError as e: + return e, graph, jit_outs, onnx_outs + + return None, graph, jit_outs, onnx_outs + + except Exception as e: + print("Unexpected error during verification.") + print("jit graph: ", original_jit_graph) + print("onnx graph: ", graph) + raise e + + +class GraphInfoPrettyPrinter: + graph_info: GraphInfo | None + upper_printer: GraphInfoPrettyPrinter | None + lower_printer: GraphInfoPrettyPrinter | None + + graph_str_lambdas: Mapping[int, str] + connector_str_lambdas: Mapping[int, str] + children_str_lambdas: Mapping[int, str] + + def __init__(self, graph_info: GraphInfo | None): + self.graph_info = graph_info + if ( + graph_info is not None + and graph_info.upper_graph_info is not None + and graph_info.lower_graph_info is not None + ): + self.upper_printer = GraphInfoPrettyPrinter(graph_info.upper_graph_info) + self.lower_printer = GraphInfoPrettyPrinter(graph_info.lower_graph_info) + else: + self.upper_printer = None + self.lower_printer = None + + def _total_rows(self) -> int: + if self.graph_info is None: + return 1 + if self.upper_printer and self.lower_printer: + return ( + self.upper_printer._total_rows() + self.lower_printer._total_rows() + 1 + ) + return 2 # Two lines: node count + id. + + def _node_count_segment_str(self) -> str: + if self.graph_info is None: + return "..." 
+ node_count = self.graph_info.essential_node_count() + has_mismatch = self.graph_info.has_mismatch() + error_node_kind = ( + f"({self.graph_info.essential_node_kinds().pop()})" + if node_count == 1 and has_mismatch + else "" + ) + + return f"{node_count} {'X' if has_mismatch else chr(0x2713)} {error_node_kind}" + + def _graph_id_segment_str(self) -> str: + if self.graph_info is None: + return "" + return f"id: {self.graph_info.id}" + + def _max_segment_columns(self) -> int: + return max( + map(len, (self._node_count_segment_str(), self._graph_id_segment_str())) + ) + + def _graph_segment_str_at_line(self, line: int) -> str: + """Get the string representation of the graph segment at the given line.""" + if line == 0: + result_str = self._node_count_segment_str() + result_str += " " * (self._max_segment_columns() - len(result_str)) + return result_str + if line == 1: + result_str = self._graph_id_segment_str() + result_str += " " * (self._max_segment_columns() - len(result_str)) + return result_str + if 0 <= line < self._total_rows(): + return " " * self._max_segment_columns() + return "" + + def _connector_segment_str_at_line(self, line: int) -> str: + """Get the connector segment string at the given line.""" + if self.upper_printer is None and self.lower_printer is None: + return "" + upper_total_rows = self.upper_printer._total_rows() if self.upper_printer else 1 + lower_total_rows = self.lower_printer._total_rows() if self.lower_printer else 1 + if line == 0: + return " __" + elif line < upper_total_rows + 1: + return " | " + elif line == upper_total_rows + 1: + return " |__" + elif line < upper_total_rows + lower_total_rows + 1: + return " " + return "" + + def _children_str_at_line(self, line: int) -> str: + """Get the string representation of the children at the given line. + + Recursively calls `_str_at_line` on children nodes. + """ + if self.upper_printer is None and self.lower_printer is None: + return "" + upper_total_rows = self.upper_printer._total_rows() if self.upper_printer else 1 + lower_total_rows = self.lower_printer._total_rows() if self.lower_printer else 1 + if 0 <= line < upper_total_rows: + return ( + self.upper_printer._str_at_line(line) if self.upper_printer else "..." + ) + elif upper_total_rows < line < upper_total_rows + lower_total_rows + 1: + return ( + self.lower_printer._str_at_line(line - upper_total_rows - 1) + if self.lower_printer + else "..." + ) + return "" + + def _str_at_line(self, line: int) -> str: + """Get the string representation of the graph at the given line.""" + return ( + self._graph_segment_str_at_line(line) + + self._connector_segment_str_at_line(line) + + self._children_str_at_line(line) + ) + + def pretty_print(self): + if self.graph_info is None: + print(None) + return + # Print tree. + print(" Tree: ".center(80, "=")) + total_rows = self._total_rows() + for line in range(total_rows): + print(self._str_at_line(line).rstrip()) + if self.graph_info.has_mismatch(): + # Summarize leaf subgraphs with mismatch. + print(" Mismatch leaf subgraphs: ".center(80, "=")) + print( + [ + graph_info.id + for graph_info in self.graph_info.all_mismatch_leaf_graph_info() + ] + ) + # Summarize node kinds with mismatch. 
+ mismatch_node_kinds: dict[str, int] = {} + for graph_info in self.graph_info.all_mismatch_leaf_graph_info(): + node_kinds = graph_info.essential_node_kinds() + if len(node_kinds) == 1: + node_kind = node_kinds.pop() + mismatch_node_kinds[node_kind] = ( + mismatch_node_kinds.get(node_kind, 0) + 1 + ) + print(" Mismatch node kinds: ".center(80, "=")) + print(mismatch_node_kinds) + else: + print(" No mismatch found. ".center(80, "=")) + + +class OnnxTestCaseRepro: + def __init__(self, repro_dir): + self.repro_dir = repro_dir + self.proto, self.inputs, self.outputs = onnx_proto_utils.load_test_case( + repro_dir + ) + + @classmethod + def create_test_case_repro( + cls, proto: bytes, inputs, outputs, dir: str, name: str | None = None + ): + """Create a repro under "{dir}/test_{name}" for an ONNX test case. + + The test case contains the model and the inputs/outputs data. The directory + structure is as follows: + + dir + \u251c\u2500\u2500 test_ + \u2502 \u251c\u2500\u2500 model.onnx + \u2502 \u2514\u2500\u2500 test_data_set_0 + \u2502 \u251c\u2500\u2500 input_0.pb + \u2502 \u251c\u2500\u2500 input_1.pb + \u2502 \u251c\u2500\u2500 output_0.pb + \u2502 \u2514\u2500\u2500 output_1.pb + + Args: + proto: ONNX model proto. + inputs: Inputs to the model. + outputs: Outputs of the model. + dir: Directory to save the repro. + name: Name of the test case. If not specified, a name based on current time + will be generated. + Returns: + Path to the repro. + """ + if name is None: + name = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S_%f") + return onnx_proto_utils.export_as_test_case( + proto, + _to_numpy(inputs), + _to_numpy(outputs), + name, + dir, + ) + + def validate(self, options: VerificationOptions): + """Run the ONNX test case with options.backend, and compare with the expected outputs. + + Args: + options: Options for validation. + + Raise: + AssertionError: if outputs from options.backend and expected outputs are not + equal up to specified precision. + """ + onnx_session = _onnx_backend_session(io.BytesIO(self.proto), options.backend) + run_outputs = onnx_session.run(None, self.inputs) + if hasattr(onnx_session, "get_outputs"): + output_names = [o.name for o in onnx_session.get_outputs()] + elif hasattr(onnx_session, "output_names"): + output_names = onnx_session.output_names + else: + raise ValueError(f"Unknown onnx session type: {type(onnx_session)}") + expected_outs = [self.outputs[name] for name in output_names] + _compare_onnx_pytorch_outputs_in_np(run_outputs, expected_outs, options) + + +@dataclasses.dataclass +class GraphInfo: + """GraphInfo contains validation information of a TorchScript graph and its converted ONNX graph.""" + + graph: torch.Graph + input_args: tuple[Any, ...] 
+ params_dict: dict[str, Any] + export_options: _experimental.ExportOptions = dataclasses.field( + default_factory=_experimental.ExportOptions + ) + mismatch_error: AssertionError | None = dataclasses.field(default=None, init=False) + pt_outs: Sequence[_NumericType] | None = dataclasses.field(default=None, init=False) + upper_graph_info: GraphInfo | None = dataclasses.field(default=None, init=False) + lower_graph_info: GraphInfo | None = dataclasses.field(default=None, init=False) + id: str = dataclasses.field(default="") + _onnx_graph: torch.Graph | None = dataclasses.field(init=False, default=None) + + _EXCLUDED_NODE_KINDS: frozenset[str] = frozenset( + {"prim::Constant", "prim::ListConstruct", "aten::ScalarImplicit"} + ) + + def clear(self): + """Clear states and results of previous verification.""" + self.mismatch_error = None + self.pt_outs = None + self._onnx_graph = None + self.upper_graph_info = None + self.lower_graph_info = None + + def pretty_print_tree(self): + """Pretty print `GraphInfo` tree. + + Each node represents a subgraph, showing the number of nodes in the subgraph and + a check mark if the subgraph has output mismatch between torch and ONNX. + + The id of the subgraph is shown under the node. The `GraphInfo` object for any + subgraph can be retrieved by calling `graph_info.find_partition(id)`. + + Example:: + + ==================================== Tree: ===================================== + 5 X __2 X __1 \u2713 + id: | id: 0 | id: 00 + | | + | |__1 X (aten::relu) + | id: 01 + | + |__3 X __1 \u2713 + id: 1 | id: 10 + | + |__2 X __1 X (aten::relu) + id: 11 | id: 110 + | + |__1 \u2713 + id: 111 + =========================== Mismatch leaf subgraphs: =========================== + ['01', '110'] + ============================= Mismatch node kinds: ============================= + {'aten::relu': 2} + + """ + GraphInfoPrettyPrinter(self).pretty_print() + + def pretty_print_mismatch(self, graph: bool = False): + """Pretty print details of the mismatch between torch and ONNX. + + Args: + graph: If True, print the ATen JIT graph and ONNX graph. + """ + print(f" Mismatch info for graph partition {self.id}: ".center(80, "=")) + if graph: + print(" ATen JIT graph ".center(80, "=")) + # TODO: A more compact graph printer. + # * Drop stride, grad, device information. + # * Show source location on a separate line. 
+ print(self.graph) + if self._onnx_graph is not None: + print(" ONNX graph ".center(80, "=")) + print(self._onnx_graph) + if self.has_mismatch(): + print(" Mismatch error ".center(80, "=")) + print(self.mismatch_error) + else: + print(" No mismatch ".center(80, "=")) + + def has_mismatch(self) -> bool: + """Return True if the subgraph has output mismatch between torch and ONNX.""" + return self.mismatch_error is not None + + def essential_node_count(self) -> int: + """Return the number of nodes in the subgraph excluding those in `_EXCLUDED_NODE_KINDS`.""" + return sum( + 1 for n in self.graph.nodes() if n.kind() not in self._EXCLUDED_NODE_KINDS + ) + + def essential_node_kinds(self) -> set[str]: + """Return the set of node kinds in the subgraph excluding those in `_EXCLUDED_NODE_KINDS`.""" + return { + n.kind() + for n in self.graph.nodes() + if n.kind() not in self._EXCLUDED_NODE_KINDS + } + + def all_mismatch_leaf_graph_info(self) -> list[GraphInfo]: + """Return a list of all leaf `GraphInfo` objects that have mismatch.""" + if not self.has_mismatch(): + return [] + + no_mismatch_children = ( + self.upper_graph_info is None or not self.upper_graph_info.has_mismatch() + ) and ( + self.lower_graph_info is None or not self.lower_graph_info.has_mismatch() + ) + + if no_mismatch_children: + return [self] + + results = [] + if self.upper_graph_info is not None: + results += self.upper_graph_info.all_mismatch_leaf_graph_info() + if self.lower_graph_info is not None: + results += self.lower_graph_info.all_mismatch_leaf_graph_info() + + return results + + def find_partition(self, id: str) -> GraphInfo | None: + """Find the `GraphInfo` object with the given id.""" + if id == self.id: + return self + current_length = len(self.id) + if len(id) > current_length: + if id[current_length] == "0" and self.upper_graph_info is not None: + return self.upper_graph_info.find_partition(id) + elif id[current_length] == "1" and self.lower_graph_info is not None: + return self.lower_graph_info.find_partition(id) + return None + + def export_repro( + self, repro_dir: str | None = None, name: str | None = None + ) -> str: + """Export the subgraph to ONNX along with the input/output data for repro. + + The repro directory will contain the following files:: + + dir + \u251c\u2500\u2500 test_ + \u2502 \u251c\u2500\u2500 model.onnx + \u2502 \u2514\u2500\u2500 test_data_set_0 + \u2502 \u251c\u2500\u2500 input_0.pb + \u2502 \u251c\u2500\u2500 input_1.pb + \u2502 \u251c\u2500\u2500 output_0.pb + \u2502 \u2514\u2500\u2500 output_1.pb + + Args: + repro_dir: The directory to export the repro files to. Defaults to current + working directory if None. + name: An optional name for the test case folder: "test_{name}". + + Returns: + The path to the exported repro directory. + """ + + if repro_dir is None: + repro_dir = os.getcwd() + repro_dir = os.path.join(repro_dir, "onnx_debug") + + onnx_graph, onnx_params_dict = _onnx_graph_from_aten_graph( + self.graph, self.export_options, self.params_dict + ) + + proto, _ = _onnx_proto_from_onnx_graph( + onnx_graph, self.export_options, onnx_params_dict + ) + return OnnxTestCaseRepro.create_test_case_repro( + proto, self.input_args, self.pt_outs, repro_dir, name + ) + + def _graph_partition_pivot(self) -> int: + """Find the pivot index to partition the graph. + + The pivot is the node that splits the graph into two parts. Each part should + have the similar amount of nodes, excluding non essential ops, defined in + `_EXCLUDED_NODE_KINDS`, such as `prim::Constant`. 
+ If the graph has an odd number of nodes, the upper part will have one more node. + If the graph does not have any node that can be partitioned, return -1. + + Returns: + The index of the pivot node. + """ + included_node_indices = [ + i + for i, n in enumerate(self.graph.nodes()) + if n.kind() not in self._EXCLUDED_NODE_KINDS + ] + half_idx = len(included_node_indices) // 2 - 1 + if half_idx >= 0 and len(included_node_indices) > half_idx: + return included_node_indices[half_idx] + 1 + return -1 + + def _partition_upper_graph(self) -> torch.Graph: + pivot = self._graph_partition_pivot() + if pivot == -1: + return torch.Graph() + graph = self.graph.copy() # Copy to not mutate parent graph. + original_outputs = list(graph.outputs()) + + def _process_bridge_value_for_upper( + new_outputs: list[torch.Value], bridge_value: torch.Value + ) -> torch.Value: + # Add bridge values as upper graph outputs. + new_outputs.append(bridge_value) + return bridge_value + + new_outputs: list[torch.Value] = [] + process_bridge_value_for_upper = functools.partial( + _process_bridge_value_for_upper, new_outputs + ) + _, dropped_nodes, complete_upper_nodes_set, _ = self._partition_nodes( + graph, pivot, process_bridge_value_for_upper + ) + + for _ in enumerate(original_outputs): + graph.eraseOutput(0) + for output in new_outputs: + graph.registerOutput(output) + + for node in reversed(dropped_nodes): + node.destroy() + + for i, input in reversed(list(enumerate(list(graph.inputs())))): + if ( + not _has_uses_by_nodes(input, complete_upper_nodes_set) + and input not in new_outputs + ): + try: + graph.eraseInput(i) + except RuntimeError as e: + print(input, graph) + raise e + + return graph + + def _partition_lower_graph(self) -> torch.Graph: + pivot = self._graph_partition_pivot() + if pivot == -1: + return torch.Graph() + graph = self.graph.copy() # Copy to not mutate parent graph. + original_outputs = list(graph.outputs()) + original_inputs = list(graph.inputs()) + + new_outputs = [] + + def _process_bridge_value_for_lower( + graph: torch.Graph, bridge_value: torch.Value + ) -> torch.Value: + # Add bridge values as lower graph inputs. 
+ new_input = graph.addInput() + bridge_value.replaceAllUsesWith(new_input) + new_input.copyMetadata(bridge_value) + return new_input + + process_bridge_value_for_lower = functools.partial( + _process_bridge_value_for_lower, graph + ) + + upper_nodes, lower_nodes, _, complete_lower_nodes_set = self._partition_nodes( + graph, pivot, process_bridge_value_for_lower + ) + + for output in original_outputs: + if _produced_by(output, lower_nodes): + new_outputs.append(output) + for _ in enumerate(original_outputs): + graph.eraseOutput(0) + for output in new_outputs: + graph.registerOutput(output) + + for input in original_inputs: + if _has_uses_by_nodes(input, complete_lower_nodes_set): + new_input = graph.addInput() + input.replaceAllUsesWith(new_input) + new_input.copyMetadata(input) + + for node in reversed(upper_nodes): + if node not in complete_lower_nodes_set: + try: + node.destroy() + except RuntimeError as e: + print(node, graph) + raise e + + for _ in original_inputs: + graph.eraseInput(0) + + return graph + + def _partition_node( + self, + node: torch.Node, + complete_upper_nodes_set: set[torch.Node], + complete_lower_nodes_set: set[torch.Node], + original_graph_outputs: set[torch.Value], + covered_bridge_values: set[torch.Value], + process_bridge_value: Callable[[torch.Value], torch.Value], + ): + if node in complete_lower_nodes_set: + return + + if ( + _node_has_uses_by(node, complete_lower_nodes_set) + and node.kind() in self._EXCLUDED_NODE_KINDS + ): + complete_lower_nodes_set.update(_all_nodes([node])) + for input in node.inputs(): + if input in covered_bridge_values: + continue + self._partition_node( + input.node(), + complete_upper_nodes_set, + complete_lower_nodes_set, + original_graph_outputs, + covered_bridge_values, + process_bridge_value, + ) + else: + for output in node.outputs(): + if output in covered_bridge_values: + continue + if ( + _has_uses_by_nodes(output, complete_lower_nodes_set) + or output in original_graph_outputs + ): + covered_bridge_values.add(process_bridge_value(output)) + + def _partition_nodes( + self, + graph: torch.Graph, + pivot: int, + process_bridge_value: Callable[[torch.Value], torch.Value], + ) -> tuple[list[torch.Node], list[torch.Node], set[torch.Node], set[torch.Node]]: + nodes = list(graph.nodes()) + upper_nodes = nodes[:pivot] + lower_nodes = nodes[pivot:] + # `upper_nodes` and `complete_upper_nodes_set` differs in that the latter + # recursively contains nodes in subblock of `upper_nodes`. + # The same applies for `lower_nodes` and `complete_lower_nodes_set`. + # With addition that `complete_lower_nodes_set` will include nodes that + # are determined to be copied from `upper_nodes` to `lower_nodes`. + complete_upper_nodes_set = _all_nodes(upper_nodes) + complete_lower_nodes_set = _all_nodes(lower_nodes) + original_graph_outputs = set(graph.outputs()) + # Bridge values are values produced from upper graph, and consumed + # by lower graph. These values need to be become upper graph outputs + # and lower graph inputs, to bridge the interaction. + # Start with all graph inputs marked as covered. If any graph input is + # needed by lower graph, just keep it in lower graph inputs later. 
+ covered_bridge_values = set(graph.inputs()) + for node in upper_nodes: + self._partition_node( + node, + complete_upper_nodes_set, + complete_lower_nodes_set, + original_graph_outputs, + covered_bridge_values, + process_bridge_value, + ) + return ( + upper_nodes, + lower_nodes, + complete_upper_nodes_set, + complete_lower_nodes_set, + ) + + def _bridge_kwargs(self): + pt_outs = self.pt_outs + graph_outputs = list(self.graph.outputs()) + assert pt_outs is not None + assert len(graph_outputs) == len( + pt_outs + ), f"{len(graph_outputs)} vs {len(pt_outs)}\nGraph: {self.graph}" + return {v.debugName(): o for v, o in zip(graph_outputs, pt_outs)} + + def _args_and_params_for_partition_graph( + self, + graph: torch.Graph, + bridge_kwargs: Mapping[str, _NumericType | Sequence[_NumericType]], + full_kwargs: Mapping[str, torch.Tensor], + full_params: Mapping[str, torch.Tensor], + ): + input_names = [input.debugName() for input in graph.inputs()] + args = tuple(bridge_kwargs[k] for k in input_names if k in bridge_kwargs) + args += tuple(full_kwargs[k] for k in input_names if k in full_kwargs) + params = {k: full_params[k] for k in input_names if k in full_params} + assert len(args) + len(params) == len( + input_names + ), f"{len(args)} + {len(params)} vs {len(input_names)}: {input_names}" + return args, params + + def verify_export( + self, options: VerificationOptions + ) -> tuple[AssertionError | None, torch.Graph, _OutputsType, _OutputsType]: + """ + Verify the export from TorchScript IR graph to ONNX. + + Export the TorchScript IR graph to ONNX, with the inputs, parameters and export + options recorded in this object. Then verify the exported ONNX graph against + the original TorchScript IR graph under the provided verification options. + + Args: + options: The verification options. + + Returns: + error: The AssertionError raised during the verification. Returns None if no + error is raised. + onnx_graph: The exported ONNX graph in TorchScript IR format. + onnx_outs: The outputs from running exported ONNX model under the onnx + backend in `options`. + pt_outs: The outputs from running the TorchScript IR graph. + """ + return verify_aten_graph( + self.graph, + input_args=self.input_args, + params_dict=self.params_dict, + export_options=self.export_options, + verification_options=options, + ) + + def find_mismatch( + self, + options: VerificationOptions | None = None, + ): + """ + Find all mismatches between the TorchScript IR graph and the exported onnx model. + + Binary searches the model graph to find the minimal subgraph that exhibits the + mismatch. A `GraphInfo` object is created for each subgraph, recording the test + inputs and export options, as well as the validation results. + + Args: + options: The verification options. + """ + self.clear() + + if options is None: + options = VerificationOptions() + + if self.export_options.verbose: + print(self.graph) + + if len(list(self.graph.outputs())) == 0: + return + + assert len(self.input_args) + len(self.params_dict) == len( + list(self.graph.inputs()) + ), ( + f"Number of graph inputs({len(list(self.graph.inputs()))}) does not match " + f"the provided tensor arguments({len(self.input_args)} + {len(self.params_dict)})." + ) + + self.mismatch_error, self._onnx_graph, self.pt_outs, _ = self.verify_export( + options + ) + + if self.mismatch_error is None: + # No mismatch found in graph. + return + + if self.essential_node_count() <= 1: + # Reached leaf node, no more partitioning. 
+ return + + full_kwargs = { + k.debugName(): v for k, v in zip(self.graph.inputs(), self.input_args) + } + full_params = self.params_dict + + upper_graph = self._partition_upper_graph() + upper_args, upper_params = self._args_and_params_for_partition_graph( + upper_graph, {}, full_kwargs, full_params + ) + self.upper_graph_info = GraphInfo( + upper_graph, + upper_args, + upper_params, + self.export_options, + id=self.id + "0", + ) + + self.upper_graph_info.find_mismatch(options) + + bridge_kwargs = self.upper_graph_info._bridge_kwargs() + lower_graph = self._partition_lower_graph() + lower_args, lower_params = self._args_and_params_for_partition_graph( + lower_graph, bridge_kwargs, full_kwargs, full_params + ) + self.lower_graph_info = GraphInfo( + lower_graph, + lower_args, + lower_params, + self.export_options, + id=self.id + "1", + ) + + self.lower_graph_info.find_mismatch(options) + + +def _all_nodes(nodes: Collection[torch.Node]) -> set[torch.Node]: + all_nodes = set(nodes) + for n in nodes: + for b in n.blocks(): + all_nodes.update(_all_nodes(list(b.nodes()))) + return all_nodes + + +def _has_uses_by_nodes(value: torch.Value, nodes: Collection[torch.Node]) -> bool: + return any(use.user in nodes for use in value.uses()) + + +def _node_has_uses_by(node: torch.Node, nodes: Collection[torch.Node]) -> bool: + for output in node.outputs(): + if _has_uses_by_nodes(output, nodes): + return True + return False + + +def _produced_by(value: torch.Value, nodes: Collection[torch.Node]) -> bool: + return value.node() in nodes + + +def find_mismatch( + model: torch.nn.Module | torch.jit.ScriptModule, + input_args: tuple[Any, ...], + do_constant_folding: bool = True, + training: _C_onnx.TrainingMode = _C_onnx.TrainingMode.EVAL, + opset_version: int | None = None, + keep_initializers_as_inputs: bool = True, + verbose: bool = False, + options: VerificationOptions | None = None, +) -> GraphInfo: + r"""Find all mismatches between the original model and the exported model. + + Experimental. The API is subject to change. + + This tool helps debug the mismatch between the original PyTorch model and exported + ONNX model. It binary searches the model graph to find the minimal subgraph that + exhibits the mismatch. + + Args: + model: The model to be exported. + input_args: The input arguments to the model. + do_constant_folding: Same as `do_constant_folding` in :func:`torch.onnx.export`. + training: Same as `training` in :func:`torch.onnx.export`. + opset_version: Same as `opset_version` in :func:`torch.onnx.export`. + keep_initializers_as_inputs: Same as `keep_initializers_as_inputs` in :func:`torch.onnx.export`. + verbose: Same as `verbose` in :func:`torch.onnx.export`. + options: The options for the mismatch verification. + + Returns: + A GraphInfo object that contains the mismatch information. + + Example:: + + >>> import torch + >>> import torch.onnx.verification + >>> torch.manual_seed(0) + >>> opset_version = 15 + >>> # Define a custom symbolic function for aten::relu. + >>> # The custom symbolic function is incorrect, which will result in mismatches. + >>> def incorrect_relu_symbolic_function(g, self): + ... return self + >>> torch.onnx.register_custom_op_symbolic( + ... "aten::relu", + ... incorrect_relu_symbolic_function, + ... opset_version=opset_version, + ... ) + >>> class Model(torch.nn.Module): + ... def __init__(self) -> None: + ... super().__init__() + ... self.layers = torch.nn.Sequential( + ... torch.nn.Linear(3, 4), + ... torch.nn.ReLU(), + ... torch.nn.Linear(4, 5), + ... 
torch.nn.ReLU(), + ... torch.nn.Linear(5, 6), + ... ) + ... def forward(self, x): + ... return self.layers(x) + >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_ONNX) + >>> graph_info = torch.onnx.verification.find_mismatch( + ... Model(), + ... (torch.randn(2, 3),), + ... opset_version=opset_version, + ... ) + ===================== Mismatch info for graph partition : ====================== + ================================ Mismatch error ================================ + Tensor-likes are not close! + Mismatched elements: 12 / 12 (100.0%) + Greatest absolute difference: 0.2328854203224182 at index (1, 2) (up to 1e-07 allowed) + Greatest relative difference: 0.699536174352349 at index (1, 3) (up to 0.001 allowed) + ==================================== Tree: ===================================== + 5 X __2 X __1 \u2713 + id: | id: 0 | id: 00 + | | + | |__1 X (aten::relu) + | id: 01 + | + |__3 X __1 \u2713 + id: 1 | id: 10 + | + |__2 X __1 X (aten::relu) + id: 11 | id: 110 + | + |__1 \u2713 + id: 111 + =========================== Mismatch leaf subgraphs: =========================== + ['01', '110'] + ============================= Mismatch node kinds: ============================= + {'aten::relu': 2} + + """ + if options is None: + options = VerificationOptions() + if opset_version is None: + opset_version = _constants.ONNX_DEFAULT_OPSET + """From aten graph, do binary search on graph partition to find operator export discrepancy.""" + # TODO: Copied from utils.py `export` until `_optimize_graph`. + if training == torch.onnx.TrainingMode.TRAINING: + model.train() + elif training == torch.onnx.TrainingMode.EVAL: + model.eval() + with torch.no_grad(): + inputs_for_export = _prepare_input_for_export(input_args, {}) + args = utils._decide_input_format(model, inputs_for_export) + + model = utils._pre_trace_quant_model(model, args) + graph, params, torch_out, module = utils._create_jit_graph(model, args) + params_dict = utils._get_named_param_dict(graph, params) + + utils._apply_friendly_debug_names(graph, params_dict) + + graph_info = GraphInfo( + graph, + input_args, + params_dict, + _experimental.ExportOptions( + do_constant_folding=do_constant_folding, + training=training, + opset_version=opset_version, + keep_initializers_as_inputs=keep_initializers_as_inputs, + verbose=verbose, + ), + ) + graph_info.find_mismatch(options) + graph_info.pretty_print_mismatch() + graph_info.pretty_print_tree() + + return graph_info
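
The `verify` helper added above wraps the whole export-then-compare loop. The following is a minimal usage sketch, not part of the diff: it assumes onnxruntime is installed (the default `VerificationOptions` backend) and a torch build that supports opset 17; `TinyMLP`, the tensor shapes, and the tolerances are placeholders.

    import torch
    from torch.onnx import verification


    class TinyMLP(torch.nn.Module):
        """Placeholder model used only for this illustration."""

        def __init__(self) -> None:
            super().__init__()
            self.net = torch.nn.Sequential(
                torch.nn.Linear(3, 8),
                torch.nn.ReLU(),
                torch.nn.Linear(8, 2),
            )

        def forward(self, x):
            return self.net(x)


    model = TinyMLP()
    args = (torch.randn(4, 3),)

    # Exports to ONNX, runs both the ONNX model (through the backend configured
    # in VerificationOptions, ONNX Runtime by default) and the PyTorch model on
    # `args` plus each entry of `additional_test_inputs`, and raises
    # AssertionError if the outputs differ beyond rtol/atol.
    verification.verify(
        model,
        args,
        input_names=["x"],
        output_names=["y"],
        dynamic_axes={"x": {0: "batch"}, "y": {0: "batch"}},
        opset_version=17,
        additional_test_inputs=[(torch.randn(1, 3),)],
        options=verification.VerificationOptions(rtol=1e-3, atol=1e-6),
    )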
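
For debugging an actual discrepancy, `find_mismatch` and the `GraphInfo` tree above bisect the graph down to failing leaf partitions, which can then be exported as standalone repros. The sketch below is again illustrative only: it reuses the deliberately broken `aten::relu` symbolic from the docstring example to force a mismatch, assumes onnxruntime is available, and uses placeholder class and variable names.

    import torch
    from torch.onnx import verification

    opset_version = 15


    def incorrect_relu_symbolic_function(g, self):
        # Deliberately wrong: drops the ReLU so ONNX outputs diverge from PyTorch.
        return self


    torch.onnx.register_custom_op_symbolic(
        "aten::relu", incorrect_relu_symbolic_function, opset_version=opset_version
    )


    class Model(torch.nn.Module):
        def __init__(self) -> None:
            super().__init__()
            self.layers = torch.nn.Sequential(
                torch.nn.Linear(3, 4), torch.nn.ReLU(), torch.nn.Linear(4, 5)
            )

        def forward(self, x):
            return self.layers(x)


    graph_info = verification.find_mismatch(
        Model(), (torch.randn(2, 3),), opset_version=opset_version
    )

    if graph_info.has_mismatch():
        # Walk the bisection tree and dump one failing leaf partition as a
        # self-contained repro (model.onnx plus input/output .pb files).
        leaf = graph_info.all_mismatch_leaf_graph_info()[0]
        leaf.pretty_print_mismatch(graph=True)
        repro_dir = leaf.export_repro()
        try:
            verification.OnnxTestCaseRepro(repro_dir).validate(
                verification.VerificationOptions()
            )
        except AssertionError as err:
            print("repro reproduces the mismatch:", err)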