Spaces:

Adi-69s
/

Image-Generator

Sleeping

App Files Files Community

Adi-69s commited on May 1, 2024

Commit

b2659ad

verified ·

1 Parent(s): 0000c2e

Upload 5061 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +15 -0
torch/_C.cp310-win_amd64.pyd +0 -0
torch/_C/_VariableFunctions.pyi +0 -0
torch/_C/__init__.pyi +0 -0
torch/_C/_autograd.pyi +123 -0
torch/_C/_cpu.pyi +5 -0
torch/_C/_cudnn.pyi +17 -0
torch/_C/_distributed_autograd.pyi +26 -0
torch/_C/_distributed_c10d.pyi +478 -0
torch/_C/_distributed_rpc.pyi +188 -0
torch/_C/_distributed_rpc_testing.pyi +35 -0
torch/_C/_functions.pyi +11 -0
torch/_C/_functorch.pyi +71 -0
torch/_C/_itt.pyi +5 -0
torch/_C/_lazy.pyi +28 -0
torch/_C/_lazy_ts_backend.pyi +11 -0
torch/_C/_monitor.pyi +44 -0
torch/_C/_nn.pyi +86 -0
torch/_C/_nvtx.pyi +6 -0
torch/_C/_onnx.pyi +38 -0
torch/_C/_profiler.pyi +238 -0
torch/_C/_verbose.pyi +3 -0
torch/_VF.py +30 -0
torch/_VF.pyi +0 -0
torch/__config__.py +22 -0
torch/__future__.py +21 -0
torch/_appdirs.py +666 -0
torch/_awaits/__init__.py +54 -0
torch/_awaits/__pycache__/__init__.cpython-310.pyc +0 -0
torch/_classes.py +55 -0
torch/_compile.py +30 -0
torch/_custom_op/__init__.py +0 -0
torch/_custom_op/__pycache__/__init__.cpython-310.pyc +0 -0
torch/_custom_op/__pycache__/autograd.cpython-310.pyc +0 -0
torch/_custom_op/__pycache__/functional.cpython-310.pyc +0 -0
torch/_custom_op/__pycache__/impl.cpython-310.pyc +0 -0
torch/_custom_op/autograd.py +274 -0
torch/_custom_op/functional.py +187 -0
torch/_custom_op/impl.py +976 -0
torch/_custom_ops.py +322 -0
torch/_decomp/__init__.py +444 -0
torch/_decomp/__pycache__/__init__.cpython-310.pyc +0 -0
torch/_decomp/__pycache__/decompositions.cpython-310.pyc +0 -0
torch/_decomp/__pycache__/decompositions_for_jvp.cpython-310.pyc +0 -0
torch/_decomp/__pycache__/decompositions_for_rng.cpython-310.pyc +0 -0
torch/_decomp/decompositions.py +0 -0
torch/_decomp/decompositions_for_jvp.py +302 -0
torch/_decomp/decompositions_for_rng.py +263 -0
torch/_deploy.py +105 -0
torch/_dispatch/__init__.py +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,18 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+torch/bin/fbgemm.dll filter=lfs diff=lfs merge=lfs -text
+torch/bin/protoc.exe filter=lfs diff=lfs merge=lfs -text
+torch/lib/dnnl.lib filter=lfs diff=lfs merge=lfs -text
+torch/lib/fbgemm.dll filter=lfs diff=lfs merge=lfs -text
+torch/lib/fbgemm.lib filter=lfs diff=lfs merge=lfs -text
+torch/lib/fmt.lib filter=lfs diff=lfs merge=lfs -text
+torch/lib/kineto.lib filter=lfs diff=lfs merge=lfs -text
+torch/lib/libiomp5md.dll filter=lfs diff=lfs merge=lfs -text
+torch/lib/libprotobuf-lite.lib filter=lfs diff=lfs merge=lfs -text
+torch/lib/libprotobuf.lib filter=lfs diff=lfs merge=lfs -text
+torch/lib/libprotoc.lib filter=lfs diff=lfs merge=lfs -text
+torch/lib/torch_cpu.dll filter=lfs diff=lfs merge=lfs -text
+torch/lib/torch_cpu.lib filter=lfs diff=lfs merge=lfs -text
+torch/lib/torch_python.dll filter=lfs diff=lfs merge=lfs -text
+torch/lib/XNNPACK.lib filter=lfs diff=lfs merge=lfs -text

torch/_C.cp310-win_amd64.pyd ADDED Viewed

Binary file (10.2 kB). View file

torch/_C/_VariableFunctions.pyi ADDED Viewed

The diff for this file is too large to render. See raw diff

torch/_C/__init__.pyi ADDED Viewed

The diff for this file is too large to render. See raw diff

torch/_C/_autograd.pyi ADDED Viewed

	@@ -0,0 +1,123 @@

+from enum import Enum
+from typing import Any, Callable, List, Optional, Set
+import torch
+from ._profiler import (
+    _ProfilerEvent,
+    ActiveProfilerType,
+    ProfilerActivity,
+    ProfilerConfig,
+)
+# Defined in tools/autograd/init.cpp
+class DeviceType(Enum):
+    CPU = ...
+    CUDA = ...
+    MKLDNN = ...
+    OPENGL = ...
+    OPENCL = ...
+    IDEEP = ...
+    HIP = ...
+    FPGA = ...
+    ORT = ...
+    XLA = ...
+    MPS = ...
+    HPU = ...
+    Meta = ...
+    Vulkan = ...
+    Metal = ...
+    PrivateUse1 = ...
+class ProfilerEvent:
+    def cpu_elapsed_us(self, other: ProfilerEvent) -> float: ...
+    def cpu_memory_usage(self) -> int: ...
+    def cuda_elapsed_us(self, other: ProfilerEvent) -> float: ...
+    def privateuse1_elapsed_us(self, other: ProfilerEvent) -> float: ...
+    def cuda_memory_usage(self) -> int: ...
+    def device(self) -> int: ...
+    def handle(self) -> int: ...
+    def has_cuda(self) -> bool: ...
+    def is_remote(self) -> bool: ...
+    def kind(self) -> int: ...
+    def name(self) -> str: ...
+    def node_id(self) -> int: ...
+    def sequence_nr(self) -> int: ...
+    def shapes(self) -> List[List[int]]: ...
+    def thread_id(self) -> int: ...
+    def flops(self) -> float: ...
+    def is_async(self) -> bool: ...
+class _KinetoEvent:
+    def name(self) -> str: ...
+    def device_index(self) -> int: ...
+    def start_us(self) -> int: ...
+    def duration_us(self) -> int: ...
+    def is_async(self) -> bool: ...
+    def linked_correlation_id(self) -> int: ...
+    def shapes(self) -> List[List[int]]: ...
+    def dtypes(self) -> List[str]: ...
+    def concrete_inputs(self) -> List[Any]: ...
+    def device_type(self) -> DeviceType: ...
+    def start_thread_id(self) -> int: ...
+    def end_thread_id(self) -> int: ...
+    def correlation_id(self) -> int: ...
+    def fwd_thread_id(self) -> int: ...
+    def stack(self) -> List[str]: ...
+    def scope(self) -> int: ...
+    def sequence_nr(self) -> int: ...
+    def flops(self) -> int: ...
+    def cuda_elapsed_us(self) -> int: ...
+    def privateuse1_elapsed_us(self) -> int: ...
+class _ProfilerResult:
+    def events(self) -> List[_KinetoEvent]: ...
+    def legacy_events(self) -> List[List[ProfilerEvent]]: ...
+    def save(self, path: str) -> None: ...
+    def experimental_event_tree(self) -> List[_ProfilerEvent]: ...
+    def trace_start_us(self) -> int: ...
+class SavedTensor: ...
+def _enable_profiler(
+    config: ProfilerConfig,
+    activities: Set[ProfilerActivity],
+) -> None: ...
+def _prepare_profiler(
+    config: ProfilerConfig,
+    activities: Set[ProfilerActivity],
+) -> None: ...
+def _disable_profiler() -> _ProfilerResult: ...
+def _profiler_enabled() -> bool: ...
+def _add_metadata_json(key: str, value: str) -> None: ...
+def _kineto_step() -> None: ...
+def _get_sequence_nr() -> int: ...
+def kineto_available() -> bool: ...
+def _record_function_with_args_enter(name: str, *args) -> torch.Tensor: ...
+def _record_function_with_args_exit(handle: torch.Tensor) -> None: ...
+def _supported_activities() -> Set[ProfilerActivity]: ...
+def _enable_record_function(enable: bool) -> None: ...
+def _set_empty_test_observer(is_global: bool, sampling_prob: float) -> None: ...
+def _push_saved_tensors_default_hooks(
+    pack_hook: Callable[[torch.Tensor], Any],
+    unpack_hook: Callable[[Any], torch.Tensor],
+) -> None: ...
+def _pop_saved_tensors_default_hooks() -> None: ...
+def _unsafe_set_version_counter(t: torch.Tensor, prev_version: int) -> None: ...
+def _enable_profiler_legacy(config: ProfilerConfig) -> None: ...
+def _disable_profiler_legacy() -> List[List[ProfilerEvent]]: ...
+def _profiler_type() -> ActiveProfilerType: ...
+def _saved_tensors_hooks_enable() -> None: ...
+def _saved_tensors_hooks_disable(message: str) -> None: ...
+def _saved_tensors_hooks_get_disabled_error_message() -> Optional[str]: ...
+class CreationMeta(Enum):
+    DEFAULT = ...
+    IN_CUSTOM_FUNCTION = ...
+    MULTI_OUTPUT_NODE = ...
+    NO_GRAD_MODE = ...
+    INFERENCE_MODE = ...
+def _set_creation_meta(t: torch.Tensor, creation_meta: CreationMeta) -> None: ...
+def _get_creation_meta(t: torch.Tensor) -> CreationMeta: ...

torch/_C/_cpu.pyi ADDED Viewed

	@@ -0,0 +1,5 @@

+from torch.types import _bool
+# Defined in torch/csrc/cpu/Module.cpp
+def _is_cpu_support_vnni() -> _bool: ...

torch/_C/_cudnn.pyi ADDED Viewed

	@@ -0,0 +1,17 @@

+from enum import Enum
+from torch.types import _bool, Tuple
+# Defined in torch/csrc/cuda/shared/cudnn.cpp
+is_cuda: _bool
+def getRuntimeVersion() -> Tuple[int, int, int]: ...
+def getCompileVersion() -> Tuple[int, int, int]: ...
+def getVersionInt() -> int: ...
+class RNNMode(int, Enum):
+    value: int
+    rnn_relu = ...
+    rnn_tanh = ...
+    lstm = ...
+    gru = ...

torch/_C/_distributed_autograd.pyi ADDED Viewed

	@@ -0,0 +1,26 @@

+from typing import Any, Dict, List, Set
+import torch
+# This module is defined in torch/csrc/distributed/autograd/init.cpp
+class DistAutogradContext:
+    def _context_id(self) -> int: ...
+    def _recv_functions(self) -> Dict[int, Any]: ...
+    def _send_functions(self) -> Dict[int, Any]: ...
+    def _known_worker_ids(self) -> Set[int]: ...
+def _new_context() -> DistAutogradContext: ...
+def _release_context(context_id: int) -> None: ...
+def _get_max_id() -> int: ...
+def _is_valid_context(worker_id: int) -> bool: ...
+def _retrieve_context(context_id: int) -> DistAutogradContext: ...
+def _current_context() -> DistAutogradContext: ...
+def _init(worker_id: int) -> None: ...
+def _get_debug_info() -> Dict[str, str]: ...
+def backward(
+    context_id: int,
+    roots: List[torch.Tensor],
+    retain_graph=False,
+) -> None: ...
+def get_gradients(context_id: int) -> Dict[torch.Tensor, torch.Tensor]: ...

torch/_C/_distributed_c10d.pyi ADDED Viewed

	@@ -0,0 +1,478 @@

+# mypy: disable-error-code="type-arg"
+from datetime import timedelta
+from enum import Enum
+from typing import Any, Dict, List, Optional, overload, Tuple, Union
+from torch import Tensor
+from torch._C import ScriptObject
+from torch.futures import Future
+# This module is defined in torch/csrc/distributed/c10d/init.cpp
+_DEFAULT_FIRST_BUCKET_BYTES: int
+_DEFAULT_NO_TIMEOUT: timedelta
+_DEFAULT_PG_TIMEOUT: timedelta
+_DEFAULT_PG_NCCL_TIMEOUT: timedelta
+class BuiltinCommHookType(Enum):
+    ALLREDUCE = ...
+    FP16_COMPRESS = ...
+def _register_comm_hook(reducer: Reducer, state: Any, comm_hook: Any): ...
+def _register_builtin_comm_hook(
+    reducer: Reducer,
+    comm_hook_type: BuiltinCommHookType,
+): ...
+class GradBucket:
+    def index(self) -> int: ...
+    def buffer(self) -> Tensor: ...
+    def gradients(self) -> List[Tensor]: ...
+    def is_last(self) -> bool: ...
+    def set_buffer(self, tensor: Tensor) -> None: ...
+    def parameters(self) -> List[Tensor]: ...
+class Reducer:
+    def __init__(
+        self,
+        params: List[Tensor],
+        bucket_indices: List[List[int]],
+        per_bucket_size_limits: List[int],
+        process_group: ProcessGroup,
+        expect_sparse_gradients: List[bool] = ...,
+        bucket_bytes_cap: int = ...,  # kDefaultBucketBytesCap in reducer.hpp
+        find_unused_parameters: bool = ...,
+        gradient_as_bucket_view: bool = ...,
+        param_to_name_mapping: Dict[int, str] = ...,
+        first_bucket_types_cap: int = ...,  # kDefaultFirstBucketBytes in reducer.hpp
+    ): ...
+    def prepare_for_forward(self) -> None: ...
+    def prepare_for_backward(self, output: List[Tensor]) -> None: ...
+    def get_backward_stats(self) -> List[int]: ...
+    def _install_post_backward_futures(self, futures: List[Future]) -> None: ...
+    def _rebuild_buckets(self) -> bool: ...
+    def _get_zeros_like_grad_buckets(self) -> List[GradBucket]: ...
+    def _push_all_rebuilt_params(self) -> None: ...
+    def _set_forward_pass_work_handle(
+        self,
+        work: Work,
+        use_static_world_size: bool,
+    ): ...
+    def _get_local_used_map(self) -> Tensor: ...
+    def _set_ddp_runtime_logging_sample_rate(self, sample_rate: int) -> None: ...
+    def _set_static_graph(self) -> None: ...
+    def _run_comm_hook(self, bucket: GradBucket) -> Future: ...
+    def set_logger(self, logger: Logger) -> None: ...
+    def _remove_autograd_hooks(self) -> None: ...
+    def _check_reducer_finalized(self) -> None: ...
+    def _set_sparse_metadata(self, global_unique_ids: Dict[str, Tensor]) -> None: ...
+    def _reset_state(self) -> None: ...
+    def _update_process_group(self, new_process_group: ProcessGroup) -> None: ...
+class DDPLoggingData:
+    strs_map: Dict[str, str]
+    ints_map: Dict[str, int]
+class Logger:
+    def __init__(self, reducer: Reducer): ...
+    def set_construction_data_and_log(
+        self,
+        module_name: str,
+        device_ids: List[int],
+        output_device: int,
+        broadcast_buffers: bool,
+        has_sync_bn: bool,
+        static_graph: bool,
+    ): ...
+    def set_runtime_stats_and_log(self) -> None: ...
+    def set_error_and_log(self, error: str) -> None: ...
+    def _get_ddp_logging_data(self) -> DDPLoggingData: ...
+    def _set_comm_hook_name(self, comm_hook: str) -> None: ...
+    def _set_uneven_input_join(self) -> None: ...
+    def _set_static_graph(self) -> None: ...
+def get_debug_level(): ...
+def set_debug_level(): ...
+def set_debug_level_from_env(): ...
+class DebugLevel(Enum):
+    OFF = ...
+    INFO = ...
+    DETAIL = ...
+class ReduceOp:
+    def __init__(self, op: RedOpType): ...
+    SUM: RedOpType = ...
+    AVG: RedOpType = ...
+    PRODUCT: RedOpType = ...
+    MIN: RedOpType = ...
+    MAX: RedOpType = ...
+    BAND: RedOpType = ...
+    BOR: RedOpType = ...
+    BXOR: RedOpType = ...
+    PREMUL_SUM: RedOpType = ...
+    UNUSED: RedOpType = ...
+    class RedOpType(Enum): ...
+class BroadcastOptions:
+    rootRank: int
+    rootTensor: int
+    timeout: timedelta
+    asyncOp: bool
+class AllreduceOptions:
+    reduceOp: ReduceOp
+    timeout: timedelta
+class AllreduceCoalescedOptions(AllreduceOptions): ...
+class ReduceOptions:
+    reduceOp: ReduceOp
+    rootRank: int
+    rootTensor: int
+    timeout: timedelta
+class AllgatherOptions:
+    timeout: timedelta
+    asyncOp: bool
+class GatherOptions:
+    rootRank: int
+    timeout: timedelta
+class ScatterOptions:
+    rootRank: int
+    timeout: timedelta
+    asyncOp: bool
+class ReduceScatterOptions:
+    reduceOp: ReduceOp
+    timeout: timedelta
+    asyncOp: bool
+class BarrierOptions:
+    device_ids: List[int]
+    timeout: timedelta
+class AllToAllOptions:
+    timeout: timedelta
+class Store:
+    def set(self, key: str, value: str): ...
+    def get(self, key: str) -> bytes: ...
+    def add(self, key: str, value: int) -> int: ...
+    def compare_set(
+        self,
+        key: str,
+        expected_value: str,
+        desired_value: str,
+    ) -> bytes: ...
+    def delete_key(self, key: str) -> bool: ...
+    def num_keys(self) -> int: ...
+    def set_timeout(self, timeout: timedelta): ...
+    @overload
+    def wait(self, keys: List[str]): ...
+    @overload
+    def wait(self, keys: List[str], timeout: timedelta): ...
+class FileStore(Store):
+    def __init__(self, path: str, numWorkers: int = ...): ...
+class HashStore(Store):
+    def __init__(self): ...
+class TCPStore(Store):
+    def __init__(
+        self,
+        host_name: str,
+        port: int,
+        world_size: Optional[int] = ...,
+        is_master: bool = ...,
+        timeout: timedelta = ...,
+        wait_for_workers: bool = ...,
+        multi_tenant: bool = ...,
+        master_listen_fd: Optional[int] = ...,
+        use_libuv: Optional[bool] = ...,
+    ): ...
+    @property
+    def host(self) -> str: ...
+    @property
+    def port(self) -> int: ...
+class PrefixStore(Store):
+    def __init__(self, prefix: str, store: Store): ...
+    @property
+    def underlying_store(self) -> Store: ...
+class Work:
+    def is_completed(self) -> bool: ...
+    def is_success(self) -> bool: ...
+    def exception(self) -> Any: ...
+    def wait(self, timeout: timedelta = ...) -> bool: ...
+    def source_rank(self) -> int: ...
+    def _source_rank(self) -> int: ...
+    def result(self) -> List[Tensor]: ...
+    def synchronize(self): ...
+    def boxed(self) -> ScriptObject: ...
+    @staticmethod
+    def unbox(obj: ScriptObject) -> Work: ...
+class ProcessGroup:
+    class Options: ...
+    def __init__(self): ...
+    def rank(self) -> int: ...
+    def size(self) -> int: ...
+    @overload
+    def broadcast(
+        self,
+        tensors: List[Tensor],
+        opts=...,
+    ) -> Work: ...
+    @overload
+    def broadcast(
+        self,
+        tensor: Tensor,
+        root: int,
+    ) -> Work: ...
+    @overload
+    def allreduce(
+        self,
+        tensors: List[Tensor],
+        opts: AllreduceOptions = ...,
+    ) -> Work: ...
+    @overload
+    def allreduce(
+        self,
+        tensors: List[Tensor],
+        op=...,
+    ) -> Work: ...
+    @overload
+    def allreduce(
+        self,
+        tensor: Tensor,
+        op=...,
+    ) -> Work: ...
+    def allreduce_coalesced(
+        self,
+        tensors: List[Tensor],
+        opts=...,
+    ) -> Work: ...
+    @overload
+    def reduce(
+        self,
+        tensors: List[Tensor],
+        opts=...,
+    ) -> Work: ...
+    @overload
+    def reduce(
+        self,
+        tensor: Tensor,
+        root: int,
+        op=...,
+    ) -> Work: ...
+    @overload
+    def allgather(
+        self,
+        output_tensors: List[List[Tensor]],
+        input_tensors: List[Tensor],
+        opts=...,
+    ) -> Work: ...
+    @overload
+    def allgather(
+        self,
+        output_tensors: List[Tensor],
+        input_tensor: Tensor,
+    ) -> Work: ...
+    def _allgather_base(
+        self,
+        output: Tensor,
+        input: Tensor,
+        opts=...,
+    ) -> Work: ...
+    def allgather_coalesced(
+        self,
+        output_lists: List[List[Tensor]],
+        input_list: List[Tensor],
+        opts=...,
+    ) -> Work: ...
+    @overload
+    def gather(
+        self,
+        output_tensors: List[List[Tensor]],
+        input_tensors: List[Tensor],
+        opts=...,
+    ) -> Work: ...
+    @overload
+    def gather(
+        self,
+        output_tensors: List[Tensor],
+        input_tensor: Tensor,
+        root: int,
+    ) -> Work: ...
+    @overload
+    def scatter(
+        self,
+        output_tensors: List[Tensor],
+        input_tensors: List[List[Tensor]],
+        opts=...,
+    ) -> Work: ...
+    @overload
+    def scatter(
+        self,
+        output_tensor: Tensor,
+        input_tensors: List[Tensor],
+        root: int,
+    ) -> Work: ...
+    @overload
+    def reduce_scatter(
+        self,
+        output_tensors: List[Tensor],
+        input_tensors: List[List[Tensor]],
+        opts=...,
+    ) -> Work: ...
+    @overload
+    def reduce_scatter(
+        self,
+        output_tensors: Tensor,
+        input_tensor: List[Tensor],
+    ) -> Work: ...
+    def _reduce_scatter_base(
+        self,
+        outputTensor: Tensor,
+        inputTensor: Tensor,
+    ) -> Work: ...
+    @overload
+    def alltoall_base(
+        self,
+        output_tensor: Tensor,
+        input_tensor: Tensor,
+        output_split_sizes: List[int],
+        input_split_sizes: List[int],
+        opts=...,
+    ) -> Work: ...
+    @overload
+    def alltoall_base(
+        self,
+        output: Tensor,
+        input: Tensor,
+        output_split_sizes: List[int],
+        input_split_sizes: List[int],
+    ) -> Work: ...
+    @overload
+    def alltoall(
+        self,
+        output_tensor: List[Tensor],
+        input_tensor: List[Tensor],
+        opts=...,
+    ) -> Work: ...
+    @overload
+    def alltoall(
+        self,
+        output: List[Tensor],
+        input: List[Tensor],
+    ) -> Work: ...
+    def send(
+        self,
+        tensors: List[Tensor],
+        dstRank: int,
+        tag: int,
+    ) -> Work: ...
+    def recv(
+        self,
+        tensors: List[Tensor],
+        srcRank: int,
+        tag: int,
+    ) -> Work: ...
+    def recv_anysource(self, tensors: List[Tensor], tag: int) -> Work: ...
+    def barrier(self, opts=...) -> Work: ...
+    def boxed(self) -> ScriptObject: ...
+    @staticmethod
+    def unbox(obj: ScriptObject) -> ProcessGroup: ...
+class ProcessGroupRoundRobin(ProcessGroup): ...
+def _round_robin_process_groups(
+    process_groups: List[ProcessGroup],
+) -> ProcessGroupRoundRobin: ...
+class ProcessGroupGloo(ProcessGroup):
+    class Device: ...
+    class Options: ...
+    def __init__(
+        self,
+        store: Store,
+        rank: int,
+        size: int,
+        timeout: timedelta,
+    ): ...
+    @staticmethod
+    def create_device(hostname="", interface="") -> Device: ...
+    @staticmethod
+    def create_default_device() -> Device: ...
+class _ProcessGroupWrapper(ProcessGroup):
+    def __init__(self, pg: ProcessGroup, gloo_pg: ProcessGroupGloo): ...
+    wrapped_pg: ProcessGroup
+class ProcessGroupNCCL(ProcessGroup):
+    class Options: ...
+    def __init__(
+        self,
+        store: Store,
+        rank: int,
+        size: int,
+        timeout: timedelta,
+    ): ...
+    def _group_start(self) -> None: ...
+    def _group_end(self) -> None: ...
+class ProcessGroupUCC(ProcessGroup):
+    def __init__(
+        self,
+        store: Store,
+        rank: int,
+        size: int,
+        timeout: timedelta,
+    ): ...
+class ProcessGroupMPI(ProcessGroup):
+    def __init__(
+        self,
+        rank: int,
+        size: int,
+        pgComm: int,
+    ): ...
+    @staticmethod
+    def create(ranks: List[int]) -> ProcessGroupMPI: ...
+def _compute_bucket_assignment_by_size(
+    tensors: List[Tensor],
+    bucket_size_limits: List[int],
+    expect_sparse_gradient: List[bool] = ...,
+    tensor_indices: List[int] = ...,
+) -> Tuple[List[List[int]], List[int]]: ...
+def _broadcast_coalesced(
+    process_group: ProcessGroup,
+    tensors: List[Tensor],
+    buffer_size: int,
+    src: int,
+): ...
+def _test_python_store(store: Store): ...
+def _verify_params_across_processes(
+    process_group: ProcessGroup,
+    params: List[Tensor],
+    logger: Optional[Logger],
+): ...
+def _make_nccl_premul_sum(factor: Union[float, List[Tensor]]) -> ReduceOp: ...
+class Backend:
+    def __init__(
+        self,
+        rank: int,
+        size: int,
+    ): ...

torch/_C/_distributed_rpc.pyi ADDED Viewed

	@@ -0,0 +1,188 @@

+# mypy: disable-error-code="type-arg"
+from datetime import timedelta
+from typing import Any, Dict, Generic, List, Optional, overload, Tuple, Type, TypeVar
+import torch
+from . import Future
+from ._autograd import ProfilerEvent
+from ._distributed_c10d import Store
+from ._profiler import ProfilerConfig
+# This module is defined in torch/csrc/distributed/rpc/init.cpp
+_DEFAULT_INIT_METHOD: str
+_DEFAULT_NUM_WORKER_THREADS: int
+_UNSET_RPC_TIMEOUT: float
+_DEFAULT_RPC_TIMEOUT_SEC: float
+_T = TypeVar("_T")
+class RpcBackendOptions:
+    rpc_timeout: float
+    init_method: str
+    def __init__(
+        self,
+        rpc_timeout: float = ...,
+        init_method: str = ...,
+    ): ...
+class WorkerInfo:
+    def __init__(self, name: str, worker_id: int): ...
+    @property
+    def name(self) -> str: ...
+    @property
+    def id(self) -> int: ...
+    def __eq__(self, other: object) -> bool: ...
+class RpcAgent:
+    def join(self, shutdown: bool = False, timeout: float = 0): ...
+    def sync(self): ...
+    def shutdown(self): ...
+    @overload
+    def get_worker_info(self) -> WorkerInfo: ...
+    @overload
+    def get_worker_info(self, workerName: str) -> WorkerInfo: ...
+    def get_worker_infos(self) -> List[WorkerInfo]: ...
+    def _get_device_map(self, dst: WorkerInfo) -> Dict[torch.device, torch.device]: ...
+    def get_debug_info(self) -> Dict[str, str]: ...
+    def get_metrics(self) -> Dict[str, str]: ...
+class PyRRef(Generic[_T]):
+    def __init__(self, value: _T, type_hint: Any = None) -> None: ...
+    def is_owner(self) -> bool: ...
+    def confirmed_by_owner(self) -> bool: ...
+    def owner(self) -> WorkerInfo: ...
+    def owner_name(self) -> str: ...
+    def to_here(self, timeout: float = ...) -> _T: ...
+    def local_value(self) -> Any: ...
+    def rpc_sync(self, timeout: float = ...) -> Any: ...
+    def rpc_async(self, timeout: float = ...) -> Any: ...
+    def remote(self, timeout: float = ...) -> Any: ...
+    def _serialize(self) -> Tuple: ...
+    @staticmethod
+    def _deserialize(tp: Tuple) -> PyRRef: ...
+    def _get_type(self) -> Type[_T]: ...
+    def _get_future(self) -> Future[_T]: ...
+    def _get_profiling_future(self) -> Future[_T]: ...
+    def _set_profiling_future(self, profilingFuture: Future[_T]): ...
+class _TensorPipeRpcBackendOptionsBase(RpcBackendOptions):
+    num_worker_threads: int
+    device_maps: Dict[str, Dict[torch.device, torch.device]]
+    devices: List[torch.device]
+    def __init__(
+        self,
+        num_worker_threads: int,
+        _transports: Optional[List],
+        _channels: Optional[List],
+        rpc_timeout: float = ...,
+        init_method: str = ...,
+        device_maps: Dict[str, Dict[torch.device, torch.device]] = {},  # noqa: B006
+        devices: List[torch.device] = [],  # noqa: B006
+    ): ...
+    def _set_device_map(
+        self,
+        to: str,
+        device_map: Dict[torch.device, torch.device],
+    ): ...
+class TensorPipeAgent(RpcAgent):
+    def __init__(
+        self,
+        store: Store,
+        name: str,
+        worker_id: int,
+        world_size: Optional[int],
+        opts: _TensorPipeRpcBackendOptionsBase,
+        reverse_device_maps: Dict[str, Dict[torch.device, torch.device]],
+        devices: List[torch.device],
+    ): ...
+    def join(self, shutdown: bool = False, timeout: float = 0): ...
+    def shutdown(self): ...
+    @overload
+    def get_worker_info(self) -> WorkerInfo: ...
+    @overload
+    def get_worker_info(self, workerName: str) -> WorkerInfo: ...
+    @overload
+    def get_worker_info(self, id: int) -> WorkerInfo: ...
+    def get_worker_infos(self) -> List[WorkerInfo]: ...
+    def _get_device_map(self, dst: WorkerInfo) -> Dict[torch.device, torch.device]: ...
+    def _update_group_membership(
+        self,
+        worker_info: WorkerInfo,
+        my_devices: List[torch.device],
+        reverse_device_map: Dict[str, Dict[torch.device, torch.device]],
+        is_join: bool,
+    ): ...
+    def _get_backend_options(self) -> _TensorPipeRpcBackendOptionsBase: ...
+    @property
+    def is_static_group(self) -> bool: ...
+    @property
+    def store(self) -> Store: ...
+def _is_current_rpc_agent_set() -> bool: ...
+def _get_current_rpc_agent() -> RpcAgent: ...
+def _set_and_start_rpc_agent(agent: RpcAgent): ...
+def _reset_current_rpc_agent(): ...
+def _delete_all_user_and_unforked_owner_rrefs(timeout: timedelta = ...): ...
+def _destroy_rref_context(ignoreRRefLeak: bool): ...
+def _rref_context_get_debug_info() -> Dict[str, str]: ...
+def _cleanup_python_rpc_handler(): ...
+def _invoke_rpc_builtin(
+    dst: WorkerInfo,
+    opName: str,
+    rpcTimeoutSeconds: float,
+    *args: Any,
+    **kwargs: Any,
+): ...
+def _invoke_rpc_python_udf(
+    dst: WorkerInfo,
+    pickledPythonUDF: str,
+    tensors: List[torch.Tensor],
+    rpcTimeoutSeconds: float,
+    isAsyncExecution: bool,
+): ...
+def _invoke_rpc_torchscript(
+    dstWorkerName: str,
+    qualifiedNameStr: str,
+    argsTuple: Tuple,
+    kwargsDict: Dict,
+    rpcTimeoutSeconds: float,
+    isAsyncExecution: bool,
+): ...
+def _invoke_remote_builtin(
+    dst: WorkerInfo,
+    opName: str,
+    rpcTimeoutSeconds: float,
+    *args: Any,
+    **kwargs: Any,
+): ...
+def _invoke_remote_python_udf(
+    dst: WorkerInfo,
+    pickledPythonUDF: str,
+    tensors: List[torch.Tensor],
+    rpcTimeoutSeconds: float,
+    isAsyncExecution: bool,
+): ...
+def _invoke_remote_torchscript(
+    dstWorkerName: WorkerInfo,
+    qualifiedNameStr: str,
+    rpcTimeoutSeconds: float,
+    isAsyncExecution: bool,
+    *args: Any,
+    **kwargs: Any,
+): ...
+def get_rpc_timeout() -> float: ...
+def enable_gil_profiling(flag: bool): ...
+def _set_rpc_timeout(rpcTimeoutSeconds: float): ...
+class RemoteProfilerManager:
+    @staticmethod
+    def set_current_profiling_key(key: str): ...
+def _enable_server_process_global_profiler(new_config: ProfilerConfig): ...
+def _disable_server_process_global_profiler() -> List[List[List[ProfilerEvent]]]: ...
+def _set_profiler_node_id(default_node_id: int): ...
+def _enable_jit_rref_pickle(): ...
+def _disable_jit_rref_pickle(): ...

torch/_C/_distributed_rpc_testing.pyi ADDED Viewed

	@@ -0,0 +1,35 @@

+from typing import Dict, List
+import torch
+from ._distributed_c10d import Store
+from ._distributed_rpc import _TensorPipeRpcBackendOptionsBase, TensorPipeAgent
+# This module is defined in torch/csrc/distributed/rpc/testing/init.cpp
+class FaultyTensorPipeRpcBackendOptions(_TensorPipeRpcBackendOptionsBase):
+    def __init__(
+        self,
+        num_worker_threads: int,
+        rpc_timeout: float,
+        init_method: str,
+        messages_to_fail: List[str],
+        messages_to_delay: Dict[str, float],
+        num_fail_sends: int,
+    ): ...
+    num_send_recv_threads: int
+    messages_to_fail: List[str]
+    messages_to_delay: Dict[str, float]
+    num_fail_sends: int
+class FaultyTensorPipeAgent(TensorPipeAgent):
+    def __init__(
+        self,
+        store: Store,
+        name: str,
+        rank: int,
+        world_size: int,
+        options: FaultyTensorPipeRpcBackendOptions,
+        reverse_device_maps: Dict[str, Dict[torch.device, torch.device]],
+        devices: List[torch.device],
+    ): ...

torch/_C/_functions.pyi ADDED Viewed

	@@ -0,0 +1,11 @@

+from typing import AnyStr, List
+from torch import Tensor
+class UndefinedGrad:
+    def __init__(self) -> None: ...
+    def __call__(self, *inputs: Tensor) -> List[Tensor]: ...
+class DelayedError:
+    def __init__(self, msg: AnyStr, num_inputs: int) -> None: ...
+    def __call__(self, inputs: List[Tensor]) -> List[Tensor]: ...

torch/_C/_functorch.pyi ADDED Viewed

	@@ -0,0 +1,71 @@

+from enum import Enum
+from typing import Optional, Tuple
+from torch import Tensor
+# Defined in torch/csrc/functorch/init.cpp
+def _set_dynamic_layer_keys_included(included: bool) -> None: ...
+def get_unwrapped(tensor: Tensor) -> Tensor: ...
+def is_batchedtensor(tensor: Tensor) -> bool: ...
+def is_functionaltensor(tensor: Tensor) -> bool: ...
+def is_functorch_wrapped_tensor(tensor: Tensor) -> bool: ...
+def is_gradtrackingtensor(tensor: Tensor) -> bool: ...
+def maybe_get_bdim(tensor: Tensor) -> int: ...
+def maybe_get_level(tensor: Tensor) -> int: ...
+def unwrap_if_dead(tensor: Tensor) -> Tensor: ...
+def _unwrap_for_grad(tensor: Tensor, level: int) -> Tensor: ...
+def _wrap_for_grad(tensor: Tensor, level: int) -> Tensor: ...
+def _unwrap_batched(tensor: Tensor, level: int) -> Tuple[Tensor, Optional[int]]: ...
+def current_level() -> int: ...
+def _add_batch_dim(tensor: Tensor, bdim: int, level: int) -> Tensor: ...
+def set_single_level_autograd_function_allowed(allowed: bool) -> None: ...
+def get_single_level_autograd_function_allowed() -> bool: ...
+def _unwrap_functional_tensor(tensor: Tensor, reapply_views: bool) -> Tensor: ...
+def _wrap_functional_tensor(tensor: Tensor, level: int) -> Tensor: ...
+# Defined in aten/src/ATen/functorch/Interpreter.h
+class TransformType(Enum):
+    Torch: TransformType = ...
+    Vmap: TransformType = ...
+    Grad: TransformType = ...
+    Jvp: TransformType = ...
+    Functionalize: TransformType = ...
+class RandomnessType(Enum):
+    Error: TransformType = ...
+    Same: TransformType = ...
+    Different: TransformType = ...
+class CInterpreter:
+    def key(self) -> TransformType: ...
+    def level(self) -> int: ...
+class CGradInterpreterPtr:
+    def __init__(self, interpreter: CInterpreter): ...
+    def lift(self, Tensor) -> Tensor: ...
+    def prevGradMode(self) -> bool: ...
+class CJvpInterpreterPtr:
+    def __init__(self, interpreter: CInterpreter): ...
+    def lift(self, Tensor) -> Tensor: ...
+    def prevFwdGradMode(self) -> bool: ...
+class CFunctionalizeInterpreterPtr:
+    def __init__(self, interpreter: CInterpreter): ...
+    def key(self) -> TransformType: ...
+    def level(self) -> int: ...
+    def functionalizeAddBackViews(self) -> bool: ...
+class CVmapInterpreterPtr:
+    def __init__(self, interpreter: CInterpreter): ...
+    def key(self) -> TransformType: ...
+    def level(self) -> int: ...
+    def batchSize(self) -> int: ...
+    def randomness(self) -> RandomnessType: ...
+class DynamicLayer: ...
+def peek_interpreter_stack() -> CInterpreter: ...
+def pop_dynamic_layer_stack() -> DynamicLayer: ...
+def push_dynamic_layer_stack(dl: DynamicLayer) -> int: ...

torch/_C/_itt.pyi ADDED Viewed

	@@ -0,0 +1,5 @@

+# Defined in torch/csrc/itt.cpp
+def is_available() -> None: ...
+def rangePush(message: str) -> None: ...
+def rangePop() -> None: ...
+def mark(message: str) -> None: ...

torch/_C/_lazy.pyi ADDED Viewed

	@@ -0,0 +1,28 @@

+from typing import List
+from torch import Tensor
+# defined in torch/csrc/lazy/python/init.cpp
+def _mark_step(device: str, devices: List[str], wait: bool): ...
+def _wait_device_ops(devices: List[str]): ...
+def _reset_metrics(): ...
+def _counter_names() -> List[str]: ...
+def _counter_value(name: str) -> int: ...
+def _metrics_report() -> str: ...
+def _get_graph_hash(tensors: List[Tensor]) -> str: ...
+def _sync_multi(
+    tensors: List[Tensor],
+    devices: List[str],
+    wait: bool = True,
+    sync_ltc_data: bool = True,
+): ...
+def _get_tensor_id(tensor: Tensor) -> int: ...
+def _get_tensors_text(tensors: List[Tensor]) -> str: ...
+def _get_tensors_dot(tensors: List[Tensor]) -> str: ...
+def _get_tensors_backend(tensors: List[Tensor]) -> str: ...
+def _get_force_fallback() -> str: ...
+def _set_force_fallback(newval: str): ...
+def _clear_ir_cache(): ...
+def _dump_ir_cache(filename: str): ...
+def _set_reuse_ir(val: bool): ...
+def _get_default_device_type(): ...

torch/_C/_lazy_ts_backend.pyi ADDED Viewed

	@@ -0,0 +1,11 @@

+# defined in torch/csrc/lazy/python/init.cpp
+from typing import Any, List, Tuple
+from torch import Tensor
+def _init(): ...
+def _get_tensors_ts_device_data_node(
+    tensors: List[Tensor],
+) -> Tuple[List[int], List[Any]]: ...
+def _run_cached_graph(hash_str: str, graph_inputs: List[Any]) -> List[Tensor]: ...

torch/_C/_monitor.pyi ADDED Viewed

	@@ -0,0 +1,44 @@

+# Defined in torch/csrc/monitor/python_init.cpp
+import datetime
+from enum import Enum
+from typing import Callable, Dict, List, Union
+class Aggregation(Enum):
+    VALUE = ...
+    MEAN = ...
+    COUNT = ...
+    SUM = ...
+    MAX = ...
+    MIN = ...
+class Stat:
+    name: str
+    count: int
+    def __init__(
+        self,
+        name: str,
+        aggregations: List[Aggregation],
+        window_size: int,
+        max_samples: int = -1,
+    ) -> None: ...
+    def add(self, v: float) -> None: ...
+    def get(self) -> Dict[Aggregation, float]: ...
+class Event:
+    name: str
+    timestamp: datetime.datetime
+    data: Dict[str, Union[int, float, bool, str]]
+    def __init__(
+        self,
+        name: str,
+        timestamp: datetime.datetime,
+        data: Dict[str, Union[int, float, bool, str]],
+    ) -> None: ...
+def log_event(e: Event) -> None: ...
+class EventHandlerHandle: ...
+def register_event_handler(handler: Callable[[Event], None]) -> EventHandlerHandle: ...
+def unregister_event_handler(handle: EventHandlerHandle) -> None: ...

torch/_C/_nn.pyi ADDED Viewed

	@@ -0,0 +1,86 @@

+# mypy: disable-error-code="type-arg"
+from typing import List, Optional, overload, Sequence, Tuple, Union
+from torch import memory_format, Tensor
+from torch.types import _bool, _device, _dtype, _int, _size
+# Defined in tools/autograd/templates/python_nn_functions.cpp
+def adaptive_max_pool2d(input: Tensor, output_size: Union[_int, _size]) -> Tuple[Tensor, Tensor]: ...
+def adaptive_max_pool3d(input: Tensor, output_size: Union[_int, _size]) -> Tuple[Tensor, Tensor]: ...
+def avg_pool2d(input: Tensor, kernel_size: Union[_int, _size], stride: Optional[Union[_int, _size]] = None, padding: Union[_int, _size] = 0, ceil_mode: bool = False, count_include_pad: bool = True, divisor_override: Optional[int] = None) -> Tensor: ...
+def avg_pool3d(input: Tensor, kernel_size: Union[_int, _size], stride: Optional[Union[_int, _size]] = None, padding: Union[_int, _size] = 0, ceil_mode: bool = False, count_include_pad: bool = True, divisor_override: Optional[int] = None) -> Tensor: ...
+def elu_(input: Tensor, alpha: float = ...) -> Tensor: ...
+def fractional_max_pool2d(input: Tensor, kernel_size: Union[_int, _size], output_size: Union[_int, _size], _random_samples: Tensor) -> Tuple[Tensor, Tensor]: ...
+def fractional_max_pool3d(input: Tensor, kernel_size: Union[_int, _size], output_size: Union[_int, _size], _random_samples: Tensor) -> Tuple[Tensor, Tensor]: ...
+def gelu(input: Tensor, approximate: str = ...) -> Tensor: ...
+def hardsigmoid(input: Tensor, *, out: Optional[Tensor] = None) -> Tensor: ...
+def hardtanh(input: Tensor, min_val: float = ..., max_val: float = ..., *, out: Optional[Tensor] = None) -> Tensor: ...
+def hardtanh_(input: Tensor, min_val: float = ..., max_val: float = ...) -> Tensor: ...
+def leaky_relu(input: Tensor, negative_slope: float = ..., *, out: Optional[Tensor] = None) -> Tensor: ...
+def leaky_relu_(input: Tensor, negative_slope: float = ...) -> Tensor: ...
+def linear(input: Tensor, weight: Tensor, bias: Optional[Tensor] = None) -> Tensor: ...
+def log_sigmoid(input: Tensor) -> Tensor: ...
+def one_hot(tensor: Tensor, num_classes: int = ...) -> Tensor: ...
+def pad(input: Tensor, pad: Sequence[int], mode: str = ..., value: Optional[float] = None) -> Tensor: ...
+def scaled_dot_product_attention(query: Tensor, key: Tensor, value: Tensor, attn_mask: Optional[Tensor] = None, dropout_p: float = 0.0, is_causal: bool = False, scale: Optional[float] = None) -> Tensor: ...
+def softplus(input: Tensor, beta: int = ..., threshold: int = ...) -> Tensor: ...
+def softshrink(input: Tensor, lambd: float = ...) -> Tensor: ...
+# Defined in aten/src/ATen/native/mkldnn/Linear.cpp
+def mkldnn_linear(input: Tensor, weight: Tensor, bias: Optional[Tensor]) -> Tensor: ...
+# Defined at aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp
+def mkldnn_reorder_conv2d_weight(
+    self: Tensor,
+    padding: List,
+    stride: List,
+    dilatation: List,
+    groups: int,
+) -> Tensor: ...
+def mkldnn_reorder_conv3d_weight(
+    self: Tensor,
+    padding: List,
+    stride: List,
+    dilatation: List,
+    groups: int,
+) -> Tensor: ...
+# Defined in aten/src/ATen/native/mkldnn/Prelu.cpp
+def mkldnn_prelu(input: Tensor, weight: Tensor) -> Tensor: ...
+# Defined at tools/autograd/templates/python_nn_functions.cpp
+@overload
+def _parse_to(
+    device: _device,
+    dtype: _dtype,
+    non_blocking: _bool,
+    copy: _bool,
+    *,
+    memory_format: memory_format,
+) -> Tuple[_device, _dtype, _bool, memory_format]: ...
+@overload
+def _parse_to(
+    dtype: _dtype,
+    non_blocking: _bool,
+    copy: _bool,
+    *,
+    memory_format: memory_format,
+) -> Tuple[_device, _dtype, _bool, memory_format]: ...
+@overload
+def _parse_to(
+    tensor: Tensor,
+    non_blocking: _bool,
+    copy: _bool,
+    *,
+    memory_format: memory_format,
+) -> Tuple[_device, _dtype, _bool, memory_format]: ...
+# Defined in aten/src/ATen/native/PadSequence.cpp
+def pad_sequence(
+    sequences: List[Tensor],
+    batch_first: bool = False,
+    padding_value: float = ...,
+) -> Tensor: ...
+def flatten_dense_tensors(tensors: List[Tensor]) -> Tensor: ...
+def unflatten_dense_tensors(flat: Tensor, tensors: List[Tensor]) -> List[Tensor]: ...

torch/_C/_nvtx.pyi ADDED Viewed

	@@ -0,0 +1,6 @@

+# Defined in torch/csrc/cuda/shared/nvtx.cpp
+def rangePushA(message: str) -> int: ...
+def rangePop() -> int: ...
+def rangeStartA(message: str) -> int: ...
+def rangeEnd(int) -> None: ...
+def markA(message: str) -> None: ...

torch/_C/_onnx.pyi ADDED Viewed

	@@ -0,0 +1,38 @@

+# Defined in torch/csrc/onnx/init.cpp
+from enum import Enum
+_CAFFE2_ATEN_FALLBACK: bool
+PRODUCER_VERSION: str
+class TensorProtoDataType(Enum):
+    UNDEFINED = ...
+    FLOAT = ...
+    UINT8 = ...
+    INT8 = ...
+    UINT16 = ...
+    INT16 = ...
+    INT32 = ...
+    INT64 = ...
+    STRING = ...
+    BOOL = ...
+    FLOAT16 = ...
+    DOUBLE = ...
+    UINT32 = ...
+    UINT64 = ...
+    COMPLEX64 = ...
+    COMPLEX128 = ...
+    BFLOAT16 = ...
+    FLOAT8E5M2 = ...
+    FLOAT8E4M3FN = ...
+class OperatorExportTypes(Enum):
+    ONNX = ...
+    ONNX_ATEN = ...
+    ONNX_ATEN_FALLBACK = ...
+    ONNX_FALLTHROUGH = ...
+class TrainingMode(Enum):
+    EVAL = ...
+    PRESERVE = ...
+    TRAINING = ...

torch/_C/_profiler.pyi ADDED Viewed

	@@ -0,0 +1,238 @@

+from enum import Enum
+from typing import Any, Dict, List, Literal, Optional, Tuple, Union
+from torch._C import device, dtype, layout
+from typing_extensions import TypeAlias
+# defined in torch/csrc/profiler/python/init.cpp
+class RecordScope(Enum):
+    FUNCTION = ...
+    BACKWARD_FUNCTION = ...
+    TORCHSCRIPT_FUNCTION = ...
+    KERNEL_FUNCTION_DTYPE = ...
+    CUSTOM_CLASS = ...
+    BUILD_FEATURE = ...
+    LITE_INTERPRETER = ...
+    USER_SCOPE = ...
+    STATIC_RUNTIME_OP = ...
+    STATIC_RUNTIME_MODEL = ...
+class ProfilerState(Enum):
+    Disable = ...
+    CPU = ...
+    CUDA = ...
+    NVTX = ...
+    ITT = ...
+    KINETO = ...
+    KINETO_GPU_FALLBACK = ...
+    KINETO_PRIVATEUSE1_FALLBACK = ...
+    KINETO_PRIVATEUSE1 = ...
+class ActiveProfilerType(Enum):
+    NONE = ...
+    LEGACY = ...
+    KINETO = ...
+    NVTX = ...
+    ITT = ...
+class ProfilerActivity(Enum):
+    CPU = ...
+    CUDA = ...
+    MTIA = ...
+    PrivateUse1 = ...
+class _EventType(Enum):
+    TorchOp = ...
+    Backend = ...
+    Allocation = ...
+    OutOfMemory = ...
+    PyCall = ...
+    PyCCall = ...
+    Kineto = ...
+class _ExperimentalConfig:
+    def __init__(
+        self,
+        profiler_metrics: List[str] = ...,
+        profiler_measure_per_kernel: bool = ...,
+        verbose: bool = ...,
+        performance_events: List[str] = ...,
+        enable_cuda_sync_events: bool = ...,
+    ) -> None: ...
+class ProfilerConfig:
+    def __init__(
+        self,
+        state: ProfilerState,
+        report_input_shapes: bool,
+        profile_memory: bool,
+        with_stack: bool,
+        with_flops: bool,
+        with_modules: bool,
+        experimental_config: _ExperimentalConfig,
+    ) -> None: ...
+class _ProfilerEvent:
+    start_tid: int
+    start_time_ns: int
+    children: List[_ProfilerEvent]
+    # TODO(robieta): remove in favor of `self.typed`
+    extra_fields: Union[
+        _ExtraFields_TorchOp,
+        _ExtraFields_Backend,
+        _ExtraFields_Allocation,
+        _ExtraFields_OutOfMemory,
+        _ExtraFields_PyCall,
+        _ExtraFields_PyCCall,
+        _ExtraFields_Kineto,
+    ]
+    @property
+    def typed(
+        self,
+    ) -> Union[
+        Tuple[Literal[_EventType.TorchOp], _ExtraFields_TorchOp],
+        Tuple[Literal[_EventType.Backend], _ExtraFields_Backend],
+        Tuple[Literal[_EventType.Allocation], _ExtraFields_Allocation],
+        Tuple[Literal[_EventType.OutOfMemory], _ExtraFields_OutOfMemory],
+        Tuple[Literal[_EventType.PyCall], _ExtraFields_PyCall],
+        Tuple[Literal[_EventType.PyCCall], _ExtraFields_PyCCall],
+        Tuple[Literal[_EventType.Kineto], _ExtraFields_Kineto],
+    ]: ...
+    @property
+    def name(self) -> str: ...
+    @property
+    def tag(self) -> _EventType: ...
+    @property
+    def id(self) -> int: ...
+    @property
+    def parent(self) -> Optional[_ProfilerEvent]: ...
+    @property
+    def correlation_id(self) -> int: ...
+    @property
+    def end_time_ns(self) -> int: ...
+    @property
+    def duration_time_ns(self) -> int: ...
+class _TensorMetadata:
+    impl_ptr: Optional[int]
+    storage_data_ptr: Optional[int]
+    id: Optional[int]
+    @property
+    def allocation_id(self) -> Optional[int]: ...
+    @property
+    def layout(self) -> layout: ...
+    @property
+    def device(self) -> device: ...
+    @property
+    def dtype(self) -> dtype: ...
+    @property
+    def sizes(self) -> List[int]: ...
+    @property
+    def strides(self) -> List[int]: ...
+Scalar: TypeAlias = Union[int, float, bool, complex]
+Input: TypeAlias = Optional[Union[_TensorMetadata, List[_TensorMetadata], Scalar]]
+class _ExtraFields_TorchOp:
+    name: str
+    sequence_number: int
+    allow_tf32_cublas: bool
+    @property
+    def inputs(self) -> List[Input]: ...
+    @property
+    def scope(self) -> RecordScope: ...
+class _ExtraFields_Backend: ...
+class _ExtraFields_Allocation:
+    ptr: int
+    id: Optional[int]
+    alloc_size: int
+    total_allocated: int
+    total_reserved: int
+    @property
+    def allocation_id(self) -> Optional[int]: ...
+    @property
+    def device(self) -> device: ...
+class _ExtraFields_OutOfMemory: ...
+class _PyFrameState:
+    line_number: int
+    function_name: str
+    @property
+    def file_name(self) -> str: ...
+class _NNModuleInfo:
+    @property
+    def self_ptr(self) -> int: ...
+    @property
+    def cls_ptr(self) -> int: ...
+    @property
+    def cls_name(self) -> str: ...
+    @property
+    def parameters(
+        self,
+    ) -> List[Tuple[str, _TensorMetadata, Optional[_TensorMetadata]]]: ...
+class _OptimizerInfo:
+    @property
+    def parameters(
+        self,
+    ) -> List[
+        Tuple[
+            # Parameter
+            _TensorMetadata,
+            #
+            # Gradient (if present during optimizer.step())
+            Optional[_TensorMetadata],
+            #
+            # Optimizer state for Parameter as (name, tensor) pairs
+            List[Tuple[str, _TensorMetadata]],
+        ]
+    ]: ...
+class _ExtraFields_PyCCall:
+    @property
+    def caller(self) -> _PyFrameState: ...
+class _ExtraFields_PyCall:
+    @property
+    def callsite(self) -> _PyFrameState: ...
+    @property
+    def caller(self) -> _PyFrameState: ...
+    @property
+    def module(self) -> Optional[_NNModuleInfo]: ...
+    @property
+    def optimizer(self) -> Optional[_OptimizerInfo]: ...
+class _ExtraFields_Kineto: ...
+def _add_execution_trace_observer(output_file_path: str) -> bool: ...
+def _remove_execution_trace_observer() -> None: ...
+def _enable_execution_trace_observer() -> None: ...
+def _disable_execution_trace_observer() -> None: ...
+def _set_record_concrete_inputs_enabled_val(val: bool) -> None: ...
+def _set_fwd_bwd_enabled_val(val: bool) -> None: ...
+def _set_cuda_sync_enabled_val(val: bool) -> None: ...
+class CapturedTraceback: ...
+def gather_traceback(python: bool, script: bool, cpp: bool) -> CapturedTraceback: ...
+# The Dict has name, filename, line
+def symbolize_tracebacks(
+    to_symbolize: List[CapturedTraceback],
+) -> List[List[Dict[str, str]]]: ...
+class _RecordFunctionFast:
+    def __init__(self, name: str) -> None: ...
+    def __enter__(self) -> None: ...
+    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None: ...

torch/_C/_verbose.pyi ADDED Viewed

	@@ -0,0 +1,3 @@

+# Defined in torch/csrc/utils/verbose.cpp
+def mkl_set_verbose(enable: int) -> int: ...
+def mkldnn_set_verbose(level: int) -> int: ...

torch/_VF.py ADDED Viewed

	@@ -0,0 +1,30 @@

+"""
+This makes the functions in torch._C._VariableFunctions available as
+    torch._VF.<funcname>
+without mypy being able to find them.
+A subset of those functions are mapped to ATen functions in
+torch/jit/_builtins.py
+See https://github.com/pytorch/pytorch/issues/21478 for the reason for
+introducing torch._VF
+"""
+import sys
+import types
+import torch
+class VFModule(types.ModuleType):
+    vf: types.ModuleType
+    def __init__(self, name):
+        super().__init__(name)
+        self.vf = torch._C._VariableFunctions
+    def __getattr__(self, attr):
+        return getattr(self.vf, attr)
+sys.modules[__name__] = VFModule(__name__)

torch/_VF.pyi ADDED Viewed

The diff for this file is too large to render. See raw diff

torch/__config__.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import torch
+def show():
+    """
+    Return a human-readable string with descriptions of the
+    configuration of PyTorch.
+    """
+    return torch._C._show_config()
+# TODO: In principle, we could provide more structured version/config
+# information here. For now only CXX_FLAGS is exposed, as Timer
+# uses them.
+def _cxx_flags():
+    """Returns the CXX_FLAGS used when building PyTorch."""
+    return torch._C._cxx_flags()
+def parallel_info():
+    r"""Returns detailed string with parallelization settings"""
+    return torch._C._parallel_info()

torch/__future__.py ADDED Viewed

	@@ -0,0 +1,21 @@

+"""
+This global flag controls whether to assign new tensors to the parameters
+instead of changing the existing parameters in-place when converting an `nn.Module`
+using the following methods:
+1. `module.cuda()` / `.cpu()` (for moving `module` between devices)
+2. `module.float()` / `.double()` / `.half()` (for converting `module` to a different dtype)
+3. `module.to()` / `.type()` (for changing `module`'s device or dtype)
+4. `module._apply(fn)` (for generic functions applied to `module`)
+Default: False
+"""
+_overwrite_module_params_on_conversion = False
+def set_overwrite_module_params_on_conversion(value):
+    global _overwrite_module_params_on_conversion
+    _overwrite_module_params_on_conversion = value
+def get_overwrite_module_params_on_conversion():
+    return _overwrite_module_params_on_conversion

torch/_appdirs.py ADDED Viewed

	@@ -0,0 +1,666 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+# Copyright (c) 2005-2010 ActiveState Software Inc.
+# Copyright (c) 2013 Eddy Petrișor
+# flake8: noqa
+"""
+This file is directly from
+https://github.com/ActiveState/appdirs/blob/3fe6a83776843a46f20c2e5587afcffe05e03b39/appdirs.py
+The license of https://github.com/ActiveState/appdirs copied below:
+# This is the MIT license
+Copyright (c) 2010 ActiveState Software Inc.
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+"""
+"""Utilities for determining application-specific dirs.
+See <https://github.com/ActiveState/appdirs> for details and usage.
+"""
+# Dev Notes:
+# - MSDN on where to store app data files:
+#   http://support.microsoft.com/default.aspx?scid=kb;en-us;310294#XSLTH3194121123120121120120
+# - Mac OS X: http://developer.apple.com/documentation/MacOSX/Conceptual/BPFileSystem/index.html
+# - XDG spec for Un*x: https://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html
+__version__ = "1.4.4"
+__version_info__ = tuple(int(segment) for segment in __version__.split("."))
+import os
+import sys
+unicode = str
+if sys.platform.startswith("java"):
+    import platform
+    os_name = platform.java_ver()[3][0]
+    if os_name.startswith("Windows"):  # "Windows XP", "Windows 7", etc.
+        system = "win32"
+    elif os_name.startswith("Mac"):  # "Mac OS X", etc.
+        system = "darwin"
+    else:  # "Linux", "SunOS", "FreeBSD", etc.
+        # Setting this to "linux2" is not ideal, but only Windows or Mac
+        # are actually checked for and the rest of the module expects
+        # *sys.platform* style strings.
+        system = "linux2"
+else:
+    system = sys.platform
+def user_data_dir(appname=None, appauthor=None, version=None, roaming=False):
+    r"""Return full path to the user-specific data dir for this application.
+        "appname" is the name of application.
+            If None, just the system directory is returned.
+        "appauthor" (only used on Windows) is the name of the
+            appauthor or distributing body for this application. Typically
+            it is the owning company name. This falls back to appname. You may
+            pass False to disable it.
+        "version" is an optional version path element to append to the
+            path. You might want to use this if you want multiple versions
+            of your app to be able to run independently. If used, this
+            would typically be "<major>.<minor>".
+            Only applied when appname is present.
+        "roaming" (boolean, default False) can be set True to use the Windows
+            roaming appdata directory. That means that for users on a Windows
+            network setup for roaming profiles, this user data will be
+            sync'd on login. See
+            <http://technet.microsoft.com/en-us/library/cc766489(WS.10).aspx>
+            for a discussion of issues.
+    Typical user data directories are:
+        Mac OS X:               ~/Library/Application Support/<AppName>
+        Unix:                   ~/.local/share/<AppName>    # or in $XDG_DATA_HOME, if defined
+        Win XP (not roaming):   C:\Documents and Settings\<username>\Application Data\<AppAuthor>\<AppName>
+        Win XP (roaming):       C:\Documents and Settings\<username>\Local Settings\Application Data\<AppAuthor>\<AppName>
+        Win 7  (not roaming):   C:\Users\<username>\AppData\Local\<AppAuthor>\<AppName>
+        Win 7  (roaming):       C:\Users\<username>\AppData\Roaming\<AppAuthor>\<AppName>
+    For Unix, we follow the XDG spec and support $XDG_DATA_HOME.
+    That means, by default "~/.local/share/<AppName>".
+    """
+    if system == "win32":
+        if appauthor is None:
+            appauthor = appname
+        const = roaming and "CSIDL_APPDATA" or "CSIDL_LOCAL_APPDATA"
+        path = os.path.normpath(_get_win_folder(const))
+        if appname:
+            if appauthor is not False:
+                path = os.path.join(path, appauthor, appname)
+            else:
+                path = os.path.join(path, appname)
+    elif system == "darwin":
+        path = os.path.expanduser("~/Library/Application Support/")
+        if appname:
+            path = os.path.join(path, appname)
+    else:
+        path = os.getenv("XDG_DATA_HOME", os.path.expanduser("~/.local/share"))
+        if appname:
+            path = os.path.join(path, appname)
+    if appname and version:
+        path = os.path.join(path, version)
+    return path
+def site_data_dir(appname=None, appauthor=None, version=None, multipath=False):
+    r"""Return full path to the user-shared data dir for this application.
+        "appname" is the name of application.
+            If None, just the system directory is returned.
+        "appauthor" (only used on Windows) is the name of the
+            appauthor or distributing body for this application. Typically
+            it is the owning company name. This falls back to appname. You may
+            pass False to disable it.
+        "version" is an optional version path element to append to the
+            path. You might want to use this if you want multiple versions
+            of your app to be able to run independently. If used, this
+            would typically be "<major>.<minor>".
+            Only applied when appname is present.
+        "multipath" is an optional parameter only applicable to *nix
+            which indicates that the entire list of data dirs should be
+            returned. By default, the first item from XDG_DATA_DIRS is
+            returned, or '/usr/local/share/<AppName>',
+            if XDG_DATA_DIRS is not set
+    Typical site data directories are:
+        Mac OS X:   /Library/Application Support/<AppName>
+        Unix:       /usr/local/share/<AppName> or /usr/share/<AppName>
+        Win XP:     C:\Documents and Settings\All Users\Application Data\<AppAuthor>\<AppName>
+        Vista:      (Fail! "C:\ProgramData" is a hidden *system* directory on Vista.)
+        Win 7:      C:\ProgramData\<AppAuthor>\<AppName>   # Hidden, but writeable on Win 7.
+    For Unix, this is using the $XDG_DATA_DIRS[0] default.
+    WARNING: Do not use this on Windows. See the Vista-Fail note above for why.
+    """
+    if system == "win32":
+        if appauthor is None:
+            appauthor = appname
+        path = os.path.normpath(_get_win_folder("CSIDL_COMMON_APPDATA"))
+        if appname:
+            if appauthor is not False:
+                path = os.path.join(path, appauthor, appname)
+            else:
+                path = os.path.join(path, appname)
+    elif system == "darwin":
+        path = os.path.expanduser("/Library/Application Support")
+        if appname:
+            path = os.path.join(path, appname)
+    else:
+        # XDG default for $XDG_DATA_DIRS
+        # only first, if multipath is False
+        path = os.getenv(
+            "XDG_DATA_DIRS", os.pathsep.join(["/usr/local/share", "/usr/share"])
+        )
+        pathlist = [
+            os.path.expanduser(x.rstrip(os.sep)) for x in path.split(os.pathsep)
+        ]
+        if appname:
+            if version:
+                appname = os.path.join(appname, version)
+            pathlist = [os.sep.join([x, appname]) for x in pathlist]
+        if multipath:
+            path = os.pathsep.join(pathlist)
+        else:
+            path = pathlist[0]
+        return path
+    if appname and version:
+        path = os.path.join(path, version)
+    return path
+def user_config_dir(appname=None, appauthor=None, version=None, roaming=False):
+    r"""Return full path to the user-specific config dir for this application.
+        "appname" is the name of application.
+            If None, just the system directory is returned.
+        "appauthor" (only used on Windows) is the name of the
+            appauthor or distributing body for this application. Typically
+            it is the owning company name. This falls back to appname. You may
+            pass False to disable it.
+        "version" is an optional version path element to append to the
+            path. You might want to use this if you want multiple versions
+            of your app to be able to run independently. If used, this
+            would typically be "<major>.<minor>".
+            Only applied when appname is present.
+        "roaming" (boolean, default False) can be set True to use the Windows
+            roaming appdata directory. That means that for users on a Windows
+            network setup for roaming profiles, this user data will be
+            sync'd on login. See
+            <http://technet.microsoft.com/en-us/library/cc766489(WS.10).aspx>
+            for a discussion of issues.
+    Typical user config directories are:
+        Mac OS X:               ~/Library/Preferences/<AppName>
+        Unix:                   ~/.config/<AppName>     # or in $XDG_CONFIG_HOME, if defined
+        Win *:                  same as user_data_dir
+    For Unix, we follow the XDG spec and support $XDG_CONFIG_HOME.
+    That means, by default "~/.config/<AppName>".
+    """
+    if system == "win32":
+        path = user_data_dir(appname, appauthor, None, roaming)
+    elif system == "darwin":
+        path = os.path.expanduser("~/Library/Preferences/")
+        if appname:
+            path = os.path.join(path, appname)
+    else:
+        path = os.getenv("XDG_CONFIG_HOME", os.path.expanduser("~/.config"))
+        if appname:
+            path = os.path.join(path, appname)
+    if appname and version:
+        path = os.path.join(path, version)
+    return path
+def site_config_dir(appname=None, appauthor=None, version=None, multipath=False):
+    r"""Return full path to the user-shared data dir for this application.
+        "appname" is the name of application.
+            If None, just the system directory is returned.
+        "appauthor" (only used on Windows) is the name of the
+            appauthor or distributing body for this application. Typically
+            it is the owning company name. This falls back to appname. You may
+            pass False to disable it.
+        "version" is an optional version path element to append to the
+            path. You might want to use this if you want multiple versions
+            of your app to be able to run independently. If used, this
+            would typically be "<major>.<minor>".
+            Only applied when appname is present.
+        "multipath" is an optional parameter only applicable to *nix
+            which indicates that the entire list of config dirs should be
+            returned. By default, the first item from XDG_CONFIG_DIRS is
+            returned, or '/etc/xdg/<AppName>', if XDG_CONFIG_DIRS is not set
+    Typical site config directories are:
+        Mac OS X:   same as site_data_dir
+        Unix:       /etc/xdg/<AppName> or $XDG_CONFIG_DIRS[i]/<AppName> for each value in
+                    $XDG_CONFIG_DIRS
+        Win *:      same as site_data_dir
+        Vista:      (Fail! "C:\ProgramData" is a hidden *system* directory on Vista.)
+    For Unix, this is using the $XDG_CONFIG_DIRS[0] default, if multipath=False
+    WARNING: Do not use this on Windows. See the Vista-Fail note above for why.
+    """
+    if system == "win32":
+        path = site_data_dir(appname, appauthor)
+        if appname and version:
+            path = os.path.join(path, version)
+    elif system == "darwin":
+        path = os.path.expanduser("/Library/Preferences")
+        if appname:
+            path = os.path.join(path, appname)
+    else:
+        # XDG default for $XDG_CONFIG_DIRS
+        # only first, if multipath is False
+        path = os.getenv("XDG_CONFIG_DIRS", "/etc/xdg")
+        pathlist = [
+            os.path.expanduser(x.rstrip(os.sep)) for x in path.split(os.pathsep)
+        ]
+        if appname:
+            if version:
+                appname = os.path.join(appname, version)
+            pathlist = [os.sep.join([x, appname]) for x in pathlist]
+        if multipath:
+            path = os.pathsep.join(pathlist)
+        else:
+            path = pathlist[0]
+    return path
+def user_cache_dir(appname=None, appauthor=None, version=None, opinion=True):
+    r"""Return full path to the user-specific cache dir for this application.
+        "appname" is the name of application.
+            If None, just the system directory is returned.
+        "appauthor" (only used on Windows) is the name of the
+            appauthor or distributing body for this application. Typically
+            it is the owning company name. This falls back to appname. You may
+            pass False to disable it.
+        "version" is an optional version path element to append to the
+            path. You might want to use this if you want multiple versions
+            of your app to be able to run independently. If used, this
+            would typically be "<major>.<minor>".
+            Only applied when appname is present.
+        "opinion" (boolean) can be False to disable the appending of
+            "Cache" to the base app data dir for Windows. See
+            discussion below.
+    Typical user cache directories are:
+        Mac OS X:   ~/Library/Caches/<AppName>
+        Unix:       ~/.cache/<AppName> (XDG default)
+        Win XP:     C:\Documents and Settings\<username>\Local Settings\Application Data\<AppAuthor>\<AppName>\Cache
+        Vista:      C:\Users\<username>\AppData\Local\<AppAuthor>\<AppName>\Cache
+    On Windows the only suggestion in the MSDN docs is that local settings go in
+    the `CSIDL_LOCAL_APPDATA` directory. This is identical to the non-roaming
+    app data dir (the default returned by `user_data_dir` above). Apps typically
+    put cache data somewhere *under* the given dir here. Some examples:
+        ...\Mozilla\Firefox\Profiles\<ProfileName>\Cache
+        ...\Acme\SuperApp\Cache\1.0
+    OPINION: This function appends "Cache" to the `CSIDL_LOCAL_APPDATA` value.
+    This can be disabled with the `opinion=False` option.
+    """
+    if system == "win32":
+        if appauthor is None:
+            appauthor = appname
+        path = os.path.normpath(_get_win_folder("CSIDL_LOCAL_APPDATA"))
+        if appname:
+            if appauthor is not False:
+                path = os.path.join(path, appauthor, appname)
+            else:
+                path = os.path.join(path, appname)
+            if opinion:
+                path = os.path.join(path, "Cache")
+    elif system == "darwin":
+        path = os.path.expanduser("~/Library/Caches")
+        if appname:
+            path = os.path.join(path, appname)
+    else:
+        path = os.getenv("XDG_CACHE_HOME", os.path.expanduser("~/.cache"))
+        if appname:
+            path = os.path.join(path, appname)
+    if appname and version:
+        path = os.path.join(path, version)
+    return path
+def user_state_dir(appname=None, appauthor=None, version=None, roaming=False):
+    r"""Return full path to the user-specific state dir for this application.
+        "appname" is the name of application.
+            If None, just the system directory is returned.
+        "appauthor" (only used on Windows) is the name of the
+            appauthor or distributing body for this application. Typically
+            it is the owning company name. This falls back to appname. You may
+            pass False to disable it.
+        "version" is an optional version path element to append to the
+            path. You might want to use this if you want multiple versions
+            of your app to be able to run independently. If used, this
+            would typically be "<major>.<minor>".
+            Only applied when appname is present.
+        "roaming" (boolean, default False) can be set True to use the Windows
+            roaming appdata directory. That means that for users on a Windows
+            network setup for roaming profiles, this user data will be
+            sync'd on login. See
+            <http://technet.microsoft.com/en-us/library/cc766489(WS.10).aspx>
+            for a discussion of issues.
+    Typical user state directories are:
+        Mac OS X:  same as user_data_dir
+        Unix:      ~/.local/state/<AppName>   # or in $XDG_STATE_HOME, if defined
+        Win *:     same as user_data_dir
+    For Unix, we follow this Debian proposal <https://wiki.debian.org/XDGBaseDirectorySpecification#state>
+    to extend the XDG spec and support $XDG_STATE_HOME.
+    That means, by default "~/.local/state/<AppName>".
+    """
+    if system in ["win32", "darwin"]:
+        path = user_data_dir(appname, appauthor, None, roaming)
+    else:
+        path = os.getenv("XDG_STATE_HOME", os.path.expanduser("~/.local/state"))
+        if appname:
+            path = os.path.join(path, appname)
+    if appname and version:
+        path = os.path.join(path, version)
+    return path
+def user_log_dir(appname=None, appauthor=None, version=None, opinion=True):
+    r"""Return full path to the user-specific log dir for this application.
+        "appname" is the name of application.
+            If None, just the system directory is returned.
+        "appauthor" (only used on Windows) is the name of the
+            appauthor or distributing body for this application. Typically
+            it is the owning company name. This falls back to appname. You may
+            pass False to disable it.
+        "version" is an optional version path element to append to the
+            path. You might want to use this if you want multiple versions
+            of your app to be able to run independently. If used, this
+            would typically be "<major>.<minor>".
+            Only applied when appname is present.
+        "opinion" (boolean) can be False to disable the appending of
+            "Logs" to the base app data dir for Windows, and "log" to the
+            base cache dir for Unix. See discussion below.
+    Typical user log directories are:
+        Mac OS X:   ~/Library/Logs/<AppName>
+        Unix:       ~/.cache/<AppName>/log  # or under $XDG_CACHE_HOME if defined
+        Win XP:     C:\Documents and Settings\<username>\Local Settings\Application Data\<AppAuthor>\<AppName>\Logs
+        Vista:      C:\Users\<username>\AppData\Local\<AppAuthor>\<AppName>\Logs
+    On Windows the only suggestion in the MSDN docs is that local settings
+    go in the `CSIDL_LOCAL_APPDATA` directory. (Note: I'm interested in
+    examples of what some windows apps use for a logs dir.)
+    OPINION: This function appends "Logs" to the `CSIDL_LOCAL_APPDATA`
+    value for Windows and appends "log" to the user cache dir for Unix.
+    This can be disabled with the `opinion=False` option.
+    """
+    if system == "darwin":
+        path = os.path.join(os.path.expanduser("~/Library/Logs"), appname)
+    elif system == "win32":
+        path = user_data_dir(appname, appauthor, version)
+        version = False
+        if opinion:
+            path = os.path.join(path, "Logs")
+    else:
+        path = user_cache_dir(appname, appauthor, version)
+        version = False
+        if opinion:
+            path = os.path.join(path, "log")
+    if appname and version:
+        path = os.path.join(path, version)
+    return path
+class AppDirs(object):
+    """Convenience wrapper for getting application dirs."""
+    def __init__(
+        self, appname=None, appauthor=None, version=None, roaming=False, multipath=False
+    ):
+        self.appname = appname
+        self.appauthor = appauthor
+        self.version = version
+        self.roaming = roaming
+        self.multipath = multipath
+    @property
+    def user_data_dir(self):
+        return user_data_dir(
+            self.appname, self.appauthor, version=self.version, roaming=self.roaming
+        )
+    @property
+    def site_data_dir(self):
+        return site_data_dir(
+            self.appname, self.appauthor, version=self.version, multipath=self.multipath
+        )
+    @property
+    def user_config_dir(self):
+        return user_config_dir(
+            self.appname, self.appauthor, version=self.version, roaming=self.roaming
+        )
+    @property
+    def site_config_dir(self):
+        return site_config_dir(
+            self.appname, self.appauthor, version=self.version, multipath=self.multipath
+        )
+    @property
+    def user_cache_dir(self):
+        return user_cache_dir(self.appname, self.appauthor, version=self.version)
+    @property
+    def user_state_dir(self):
+        return user_state_dir(self.appname, self.appauthor, version=self.version)
+    @property
+    def user_log_dir(self):
+        return user_log_dir(self.appname, self.appauthor, version=self.version)
+# ---- internal support stuff
+def _get_win_folder_from_registry(csidl_name):
+    """This is a fallback technique at best. I'm not sure if using the
+    registry for this guarantees us the correct answer for all CSIDL_*
+    names.
+    """
+    import winreg as _winreg
+    shell_folder_name = {
+        "CSIDL_APPDATA": "AppData",
+        "CSIDL_COMMON_APPDATA": "Common AppData",
+        "CSIDL_LOCAL_APPDATA": "Local AppData",
+    }[csidl_name]
+    key = _winreg.OpenKey(
+        _winreg.HKEY_CURRENT_USER,
+        r"Software\Microsoft\Windows\CurrentVersion\Explorer\Shell Folders",
+    )
+    dir, type = _winreg.QueryValueEx(key, shell_folder_name)
+    return dir
+def _get_win_folder_with_pywin32(csidl_name):
+    from win32com.shell import shell, shellcon
+    dir = shell.SHGetFolderPath(0, getattr(shellcon, csidl_name), 0, 0)
+    # Try to make this a unicode path because SHGetFolderPath does
+    # not return unicode strings when there is unicode data in the
+    # path.
+    try:
+        dir = unicode(dir)
+        # Downgrade to short path name if have highbit chars. See
+        # <http://bugs.activestate.com/show_bug.cgi?id=85099>.
+        has_high_char = False
+        for c in dir:
+            if ord(c) > 255:
+                has_high_char = True
+                break
+        if has_high_char:
+            try:
+                import win32api
+                dir = win32api.GetShortPathName(dir)
+            except ImportError:
+                pass
+    except UnicodeError:
+        pass
+    return dir
+def _get_win_folder_with_ctypes(csidl_name):
+    import ctypes
+    csidl_const = {
+        "CSIDL_APPDATA": 26,
+        "CSIDL_COMMON_APPDATA": 35,
+        "CSIDL_LOCAL_APPDATA": 28,
+    }[csidl_name]
+    buf = ctypes.create_unicode_buffer(1024)
+    ctypes.windll.shell32.SHGetFolderPathW(None, csidl_const, None, 0, buf)
+    # Downgrade to short path name if have highbit chars. See
+    # <http://bugs.activestate.com/show_bug.cgi?id=85099>.
+    has_high_char = False
+    for c in buf:
+        if ord(c) > 255:
+            has_high_char = True
+            break
+    if has_high_char:
+        buf2 = ctypes.create_unicode_buffer(1024)
+        if ctypes.windll.kernel32.GetShortPathNameW(buf.value, buf2, 1024):
+            buf = buf2
+    return buf.value
+def _get_win_folder_with_jna(csidl_name):
+    import array
+    from com.sun import jna
+    from com.sun.jna.platform import win32
+    buf_size = win32.WinDef.MAX_PATH * 2
+    buf = array.zeros("c", buf_size)
+    shell = win32.Shell32.INSTANCE
+    shell.SHGetFolderPath(
+        None,
+        getattr(win32.ShlObj, csidl_name),
+        None,
+        win32.ShlObj.SHGFP_TYPE_CURRENT,
+        buf,
+    )
+    dir = jna.Native.toString(buf.tostring()).rstrip("\0")
+    # Downgrade to short path name if have highbit chars. See
+    # <http://bugs.activestate.com/show_bug.cgi?id=85099>.
+    has_high_char = False
+    for c in dir:
+        if ord(c) > 255:
+            has_high_char = True
+            break
+    if has_high_char:
+        buf = array.zeros("c", buf_size)
+        kernel = win32.Kernel32.INSTANCE
+        if kernel.GetShortPathName(dir, buf, buf_size):
+            dir = jna.Native.toString(buf.tostring()).rstrip("\0")
+    return dir
+if system == "win32":
+    try:
+        import win32com.shell
+        _get_win_folder = _get_win_folder_with_pywin32
+    except ImportError:
+        try:
+            from ctypes import windll
+            _get_win_folder = _get_win_folder_with_ctypes
+        except ImportError:
+            try:
+                import com.sun.jna
+                _get_win_folder = _get_win_folder_with_jna
+            except ImportError:
+                _get_win_folder = _get_win_folder_from_registry
+# ---- self test code
+if __name__ == "__main__":
+    appname = "MyApp"
+    appauthor = "MyCompany"
+    props = (
+        "user_data_dir",
+        "user_config_dir",
+        "user_cache_dir",
+        "user_state_dir",
+        "user_log_dir",
+        "site_data_dir",
+        "site_config_dir",
+    )
+    print(f"-- app dirs {__version__} --")
+    print("-- app dirs (with optional 'version')")
+    dirs = AppDirs(appname, appauthor, version="1.0")
+    for prop in props:
+        print(f"{prop}: {getattr(dirs, prop)}")
+    print("\n-- app dirs (without optional 'version')")
+    dirs = AppDirs(appname, appauthor)
+    for prop in props:
+        print(f"{prop}: {getattr(dirs, prop)}")
+    print("\n-- app dirs (without optional 'appauthor')")
+    dirs = AppDirs(appname)
+    for prop in props:
+        print(f"{prop}: {getattr(dirs, prop)}")
+    print("\n-- app dirs (with disabled 'appauthor')")
+    dirs = AppDirs(appname, appauthor=False)
+    for prop in props:
+        print(f"{prop}: {getattr(dirs, prop)}")

torch/_awaits/__init__.py ADDED Viewed

	@@ -0,0 +1,54 @@

+from __future__ import annotations
+from typing import cast, Callable, Generic, Type, TypeVar
+import torch
+__all__ = ['Await']
+W = TypeVar("W")
+class _PyAwaitMeta(type(torch._C._Await), type(Generic)):  # type: ignore[misc, no-redef]
+    pass
+class _Await(torch._C._Await, Generic[W], metaclass=_PyAwaitMeta):
+    r"""
+    Wrapper around a ``torch._C.Await`` which encapsulates delayed execution
+    of a callable. All manipulations happen with functions ``torch.jit._awaitable``,
+    ``torch.jit._awaitable_wait``, ``torch.jit._awaitable_nowait``.
+    Torch scriptable manipulations:
+    ``torch.jit._awaitable(func, *args)``
+    Creates ``Await[W]`` object, where W is return type of func.
+    Returns:
+    ``torch.jit._awaitable_wait(Await[W])``
+    Returns the result of the function, specified at ``_awaitable``,  with specified arguments.
+    Returns:
+        The result of type ``W`` of the function call. The result is owned by ``Await[W]``
+        and returned on all following ``_awaitable_wait`` calls.
+    ``torch.jit._awaitable_nowait(W)``
+    Returns:
+        Trivial ``Await[W]`` with specified result.
+    Only in eager mode:
+    ``fn() -> Callable[Tuple[Any], W]``
+    Returns:
+        Specified at ``_awaitable`` python function ``func``.
+    ``args() -> Tuple[Any]``
+    Returns:
+        Specified at ``_awaitable`` python args.
+    ``is_nowait() -> _bool``
+    Returns:
+        ``True`` if this object was created via ``_awaitable_nowait`` call (trivial `Await[W]`).
+    In eager mode ``Await[W]`` can be used as ``W`` i.e. attributes of W can be called on ``Await[W]``,
+    ``_awaitable_wait()`` call will be transparently added.
+    """
+    pass

torch/_awaits/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (2.08 kB). View file

torch/_classes.py ADDED Viewed

	@@ -0,0 +1,55 @@

+import types
+import torch._C
+class _ClassNamespace(types.ModuleType):
+    def __init__(self, name):
+        super().__init__("torch.classes" + name)
+        self.name = name
+    def __getattr__(self, attr):
+        proxy = torch._C._get_custom_class_python_wrapper(self.name, attr)
+        if proxy is None:
+            raise RuntimeError(f"Class {self.name}.{attr} not registered!")
+        return proxy
+class _Classes(types.ModuleType):
+    __file__ = "_classes.py"
+    def __init__(self):
+        super().__init__("torch.classes")
+    def __getattr__(self, name):
+        namespace = _ClassNamespace(name)
+        setattr(self, name, namespace)
+        return namespace
+    @property
+    def loaded_libraries(self):
+        return torch.ops.loaded_libraries
+    def load_library(self, path):
+        """
+        Loads a shared library from the given path into the current process.
+        The library being loaded may run global initialization code to register
+        custom classes with the PyTorch JIT runtime. This allows dynamically
+        loading custom classes. For this, you should compile your class
+        and the static registration code into a shared library object, and then
+        call ``torch.classes.load_library('path/to/libcustom.so')`` to load the
+        shared object.
+        After the library is loaded, it is added to the
+        ``torch.classes.loaded_libraries`` attribute, a set that may be inspected
+        for the paths of all libraries loaded using this function.
+        Args:
+            path (str): A path to a shared library to load.
+        """
+        torch.ops.load_library(path)
+# The classes "namespace"
+classes = _Classes()

torch/_compile.py ADDED Viewed

	@@ -0,0 +1,30 @@

+"""
+APIs related to torch.compile which lazily import torch._dynamo to avoid
+circular dependencies.
+"""
+import functools
+def _disable_dynamo(fn=None, recursive=True):
+    """
+    This API should be only used inside torch, external users should still use
+    torch._dynamo.disable. The main goal of this API is to avoid circular
+    imports issues that is common while using _dynamo.disable inside torch
+    itself.
+    This API avoids it by lazily importing torch._dynamo from the import time to
+    the invocation of the decorated function.
+    """
+    if fn is not None:
+        @functools.wraps(fn)
+        def inner(*args, **kwargs):
+            import torch._dynamo
+            return torch._dynamo.disable(fn, recursive)(*args, **kwargs)
+        return inner
+    else:
+        # decorator usage like @_disable_dynamo(recursive=False). The resulting
+        # object expects the original decorated function as the arg.
+        return functools.partial(_disable_dynamo, recursive=recursive)

torch/_custom_op/__init__.py ADDED Viewed

File without changes

torch/_custom_op/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (156 Bytes). View file

torch/_custom_op/__pycache__/autograd.cpython-310.pyc ADDED Viewed

Binary file (8.86 kB). View file

torch/_custom_op/__pycache__/functional.cpython-310.pyc ADDED Viewed

Binary file (5.95 kB). View file

torch/_custom_op/__pycache__/impl.cpython-310.pyc ADDED Viewed

Binary file (33.5 kB). View file

torch/_custom_op/autograd.py ADDED Viewed

	@@ -0,0 +1,274 @@

+import torch
+import torch.utils._pytree as pytree
+from collections import namedtuple
+import functools
+# NOTE [CustomOp autograd kernel indirection]
+# We register `inner` as the autograd kernel for this custom_op.
+# `inner` either calls the autograd formula registered by the user,
+# or goes into an `autograd_not_implemented` kernel.
+#
+# The reason why this indirection exists is
+# so that we can swap out the autograd kernel (the PyTorch dispatcher
+# doesn't actually allow us to do this). By default, we want
+# the `autograd_not_implemented` behavior, but then the user may come
+# and register something that is actually a backward formula
+def autograd_kernel_indirection(custom_op):
+    autograd_fallback = autograd_not_implemented(custom_op)
+    def inner(*args, **kwargs):
+        if custom_op._has_impl('autograd'):
+            kernel = custom_op._get_impl('autograd').func
+            return kernel(*args, **kwargs)
+        # As explained in NOTE ["backward", "save_for_backward", and "autograd"],
+        # after the user gives us "backward" and "save_for_backward", we generate
+        # the "autograd" impl. If the user only provided one, then we tell
+        # the user they've done something wrong.
+        if custom_op._has_impl('save_for_backward') or custom_op._has_impl('backward'):
+            missing = (
+                'save_for_backward' if custom_op._has_impl('backward')
+                else 'backward'
+            )
+            found = 'save_for_backward' if missing == 'backward' else 'backward'
+            loc = custom_op._get_impl(found).location
+            raise RuntimeError(
+                f"We found a '{found}' registration for {custom_op} at "
+                f"{loc} but were unable to find a '{missing}' registration. "
+                f"To use the CustomOp API to register a backward formula, "
+                f"please provide us both a backward function and a "
+                f"'save for backward' function via `impl_backward` and "
+                f"`impl_save_for_backward` respectively.")
+        return autograd_fallback(*args, **kwargs)
+    return inner
+# TODO(#101191): Use the actual C++ autograd not implemented fallback,
+# or change the default autograd fallback to the autograd not implemented fallback.
+def autograd_not_implemented(custom_op):
+    def kernel(*args, **kwargs):
+        if torch.is_grad_enabled() and pytree.tree_any(
+            lambda x: isinstance(x, torch.Tensor) and x.requires_grad, (args, kwargs)
+        ):
+            raise RuntimeError("Autograd has not been implemented for operator")
+        with torch._C._AutoDispatchBelowAutograd():
+            return custom_op(*args, **kwargs)
+    return kernel
+def mark_non_differentiable(ctx, output, output_differentiability):
+    # Output types are restricted to be:
+    # - Tensor
+    # - Tensor[]
+    # - int, bool, Scalar, float
+    # See _check_can_register_backward
+    if output_differentiability is not None:
+        if not isinstance(output, tuple):
+            tuple_output = (output,)
+        else:
+            tuple_output = output  # type: ignore[assignment]
+        assert len(output_differentiability) == len(tuple_output)
+        non_differentiable_tensors = []
+        for idx, (differentiable, out) in enumerate(zip(output_differentiability, tuple_output)):
+            if isinstance(out, torch.Tensor):
+                if not differentiable:
+                    non_differentiable_tensors.append(out)
+                continue
+            if isinstance(out, list):
+                if not differentiable:
+                    non_differentiable_tensors.extend(out)
+                continue
+            if differentiable:
+                raise RuntimeError(
+                    f"With output_differentiability={output_differentiability}. "
+                    f"At idx {idx}, we received an object of type {type(out)} that "
+                    f"is not a Tensor, so it cannot have be marked as differentiable in "
+                    f"output_differentiability.")
+        if non_differentiable_tensors:
+            ctx.mark_non_differentiable(*non_differentiable_tensors)
+def construct_autograd_kernel(
+        schema,
+        output_differentiability,
+        custom_op,
+        op_overload,
+        save_for_backward_fn,
+        backward_fn):
+    def apply(*args):
+        flat_args, spec = pytree.tree_flatten(args)
+        out_spec = None
+        def forward(ctx, *flat_args):
+            ctx.set_materialize_grads(True)
+            args = pytree.tree_unflatten(list(flat_args), spec)
+            with torch._C._AutoDispatchBelowAutograd():
+                output = op_overload(*args)
+            # We use the info about args to give better error messages in backward
+            args_info = namedtuple_args(
+                schema, pytree.tree_map(type, args))
+            save_for_backward_fn_inputs = namedtuple_args(schema, args)
+            to_save = save_for_backward_fn(save_for_backward_fn_inputs, output)
+            save_pytree_for_backward(ctx, (to_save, args_info))
+            mark_non_differentiable(ctx, output, output_differentiability)
+            nonlocal out_spec
+            flat_output, out_spec = pytree.tree_flatten(output)
+            return tuple(flat_output)
+        def backward(ctx, *flat_grad_output):
+            assert out_spec is not None
+            grads = pytree.tree_unflatten(list(flat_grad_output), out_spec)
+            saved, args_info = unpack_saved(ctx)
+            # There is nothing on the ctx object for now, it is just there so
+            # that we can add additional things in the future.
+            inner_ctx = object()
+            if not isinstance(grads, tuple):
+                grads = (grads,)
+            grad_inputs_dict = backward_fn(inner_ctx, saved, *grads)
+            # Massage the grad_inputs_dict to a form acceptable by
+            # autograd.Function.
+            validate_grad_inputs_dict(grad_inputs_dict, custom_op, args_info)
+            return grad_inputs_dict_to_flat_tuple(grad_inputs_dict, args_info)
+        generated_cls = gen_autograd_function(
+            custom_op._opname + '_customop', forward, backward)
+        flat_output = generated_cls.apply(*flat_args)
+        assert out_spec is not None
+        return pytree.tree_unflatten(list(flat_output), out_spec)
+    return apply
+def gen_autograd_function(name, forward, backward):
+    generated_cls = type(
+        name,
+        (torch.autograd.Function,),
+        {
+            'forward': staticmethod(forward),
+            'backward': staticmethod(backward),
+        }
+    )
+    return generated_cls
+@functools.lru_cache
+def namedtuple_args_cls(schema):
+    attribs = [arg.name for arg in schema.arguments.flat_all]
+    name = str(schema.name) + "_args"
+    # mypy doesn't support dynamic namedtuple name
+    tuple_cls = namedtuple(name, attribs)  # type: ignore[misc]
+    return tuple_cls
+def namedtuple_args(schema, args):
+    assert isinstance(args, tuple)
+    tuple_cls = namedtuple_args_cls(schema)
+    return tuple_cls(*args)
+def validate_grad_inputs_dict(grad_inputs_dict, forward_op, args_info):
+    def error(what):
+        backward = forward_op._get_impl('backward')
+        raise RuntimeError(
+            f"In the backward function defined for {forward_op} at "
+            f"{backward.location} using the CustomOp API, {what}")
+    if not isinstance(grad_inputs_dict, dict):
+        error(f"expected the output of the backward function to be a dict but "
+              f"got {type(grad_inputs_dict)}")
+    expected_keys = {arg.name for arg in forward_op._schema.arguments.flat_all
+                     if arg.type.is_tensor_like()}
+    actual_keys = grad_inputs_dict.keys()
+    if expected_keys != actual_keys:
+        error(f"expected the returned grad_input dict to have keys "
+              f"{expected_keys} but got {actual_keys}. The backward "
+              f"function must return a gradient (can be None) for each arg "
+              f"to the CustomOp that may be a Tensor or Sequence[Tensor]. "
+              f"Args declared to be non-Tensor-like types should not appear "
+              f"in the grad_input dict")
+    for name, grad in grad_inputs_dict.items():
+        arg_info = getattr(args_info, name)
+        if isinstance(arg_info, list):
+            if not isinstance(grad, (tuple, list)):
+                error(f"for input '{name}' expected the grad_input dict to "
+                      f"hold a list of gradients but got object of type "
+                      f"{type(grad)}.")
+            if not len(grad) == len(arg_info):
+                error(f"for input '{name}' expected the grad_input dict to "
+                      f"hold a list of {len(arg_info)} gradients but got "
+                      f"{len(grad)}")
+            for idx, (g, info) in enumerate(zip(grad, arg_info)):
+                if g is None:
+                    continue
+                if not isinstance(g, torch.Tensor):
+                    error(f"for input '{name}' expected the grad_input dict to "
+                          f"hold a list of None or Tensor gradients but got "
+                          f"object of {type(g)} at index {idx}")
+                if not issubclass(info, torch.Tensor):
+                    error(f"for input '{name}', got a Tensor as the gradient "
+                          f"for the {idx}-th value but expected None because "
+                          f"the {idx}-th value was not a Tensor (it was "
+                          f"type {arg_info}")
+            continue
+        if grad is None:
+            continue
+        if not isinstance(grad, torch.Tensor):
+            error(f"got object of type {type(grad)} as the gradient for input "
+                  f"'{name}', "
+                  f"but expected the gradient to be either None or a Tensor")
+        if not issubclass(arg_info, torch.Tensor):
+            error(f"got a Tensor as the gradient for input '{name}' but "
+                  f"expected None as the gradient because input '{name}' "
+                  f"was not a Tensor (it was type {arg_info}).")
+def grad_inputs_dict_to_flat_tuple(grad_inputs_dict, args_info):
+    result = []
+    for name, arg_info in args_info._asdict().items():
+        if name not in grad_inputs_dict:
+            result.append(pytree.tree_map(lambda x: None, arg_info))
+            continue
+        result.append(grad_inputs_dict[name])
+    return tuple(pytree.tree_leaves(result))
+# Saves "stuff" (a pytree) onto the ctx object. Use unpack_saved to unpack it.
+# autograd.Function prefers that users use ctx.save_for_backward to
+# save Tensors (to avoid reference cycles) and for non-Tensors to go onto the
+# ctx object.
+def save_pytree_for_backward(ctx, stuff):
+    flat_stuff, spec = pytree.tree_flatten(stuff)
+    num_elts = len(flat_stuff)
+    tensor_idxs = [idx for idx, thing in enumerate(flat_stuff)
+                   if isinstance(thing, torch.Tensor)]
+    non_tensor_idxs = [idx for idx, thing in enumerate(flat_stuff)
+                       if not isinstance(thing, torch.Tensor)]
+    tensors = [thing for thing in flat_stuff if isinstance(thing, torch.Tensor)]
+    non_tensors = [thing for thing in flat_stuff if not isinstance(thing, torch.Tensor)]
+    ctx.spec = spec
+    ctx.num_elts = num_elts
+    ctx.save_for_backward(*tensors)
+    ctx.tensor_idxs = tensor_idxs
+    ctx.saved_non_tensors = non_tensors
+    ctx.non_tensor_idxs = non_tensor_idxs
+# Inverse operation to save_pytree_for_backward
+def unpack_saved(ctx):
+    flat_stuff = [None] * ctx.num_elts
+    for tensor, idx in zip(ctx.saved_tensors, ctx.tensor_idxs):
+        flat_stuff[idx] = tensor
+    for non_tensor, idx in zip(ctx.saved_non_tensors, ctx.non_tensor_idxs):
+        flat_stuff[idx] = non_tensor
+    stuff = pytree.tree_unflatten(flat_stuff, ctx.spec)
+    return stuff

torch/_custom_op/functional.py ADDED Viewed

	@@ -0,0 +1,187 @@

+import weakref
+import torch
+import torch.utils._pytree as pytree
+from torch._C import _ExcludeDispatchKeyGuard, DispatchKey, DispatchKeySet
+from torch._ops import OpOverload
+from torch.library import Library
+from torchgen.model import (
+    BaseTy,
+    BaseType,
+    FunctionSchema,
+    OperatorName,
+    OptionalType,
+    SchemaKind,
+)
+from .autograd import autograd_not_implemented
+def register_functional_op(
+    lib: Library,
+    new_op_name: str,
+    mutable_op: OpOverload,
+) -> None:
+    """Given a mutable operator, registers the functional variant.
+    This API also correctly links the functional variant with the mutable
+    operator for the purposes of functionalization.
+    All of the new registrations are performed on the ``lib`` passed in.
+    Arguments:
+        lib (Library): Should be a torch.library.Library object that has
+            the same namespace as ``mutable_op``'s namespace.
+            lib will be used to register the new functional op as well
+            as a functionalization kernel for the ``mutable_op``
+            If you don't have a library handy, use
+            ``torch.library.Library(ns, 'FRAGMENT')`` to construct one.
+        new_op_name (str): The name of the functional operator (without the
+            namespace). If no namespace, the new functional variant will be
+            accessible under ``torch.ops.{lib.ns}.new_op_name``.
+        mutable_op (OpOverload): The mutable custom operator. Note
+            that you may need to add a `.default` to it, like
+            `torch.ops.aten.abs_.default`.
+    """
+    validate(mutable_op)
+    schema = functional_schema(new_op_name, mutable_op)
+    lib.define(schema)
+    functional_impl = construct_functional_impl(mutable_op)
+    lib.impl(new_op_name, functional_impl, 'CompositeExplicitAutograd')
+    functional_op = getattr(getattr(torch.ops, lib.ns), new_op_name).default
+    # There's no easy way for us to generate the autograd kernel, so we
+    # use autograd_not_implemented. Also, this makes it so that the user
+    # is unable to register an autograd formula themselves. This shouldn't
+    # be a problem if the user doesn't use the functional op direclty
+    # in their program, but we may need to revist this in the future.
+    lib.impl(new_op_name, autograd_not_implemented(functional_op), 'Autograd')
+    f_kernel = construct_functionalization_kernel(weakref.proxy(mutable_op), functional_op)
+    lib.impl(mutable_op, f_kernel, 'Functionalize')
+def construct_functional_impl(mutable_op):
+    def functional_impl(*args):
+        # Strategy:
+        # - clone args that would have been mutated
+        # - run mutable_op
+        # - return the cloned args as additional outputs
+        new_args = []
+        extra_rets = []
+        for is_write, arg in zip(mutable_args(mutable_op), args):
+            if is_write:
+                cloned = arg.clone() if arg is not None else None
+                new_args.append(cloned)
+                extra_rets.append(cloned)
+            else:
+                new_args.append(arg)
+        result = mutable_op(*new_args)
+        if result is None:
+            return tuple(extra_rets)
+        if isinstance(result, tuple):
+            return (*result, *extra_rets)
+        return (result, *extra_rets)
+    return functional_impl
+def construct_functionalization_kernel(mutable_op, functional_op):
+    def kernel(*args):
+        # There's nothing to be functionalized!
+        # We can still end up here because DispatchKey::Functionalize is a mode key
+        if pytree.tree_all_only(torch.Tensor, lambda x: not torch._is_functional_tensor(x), args):
+            with _ExcludeDispatchKeyGuard(DispatchKeySet(DispatchKey.Functionalize)):
+                return mutable_op(*args)
+        # NB: This differs from the codegen -- codegen handles cases where there
+        # are mixed FunctionalTensorWrapper and non-FunctionalTensorWrapper.
+        # This only really matters for XLA (mixed CPU-XLA tensors) and
+        # running functionalization without the PT2 stack (which guarantees to us that
+        # all tensors are FunctionalTensorWrapper).
+        if not pytree.tree_all_only(torch.Tensor, torch._is_functional_tensor, args):
+            raise RuntimeError("{mutable_op}: expected all args to be FunctionalTensorWrapper")
+        unwrapped_args = []
+        for arg in args:
+            if isinstance(arg, torch.Tensor) and torch._is_functional_tensor(arg):
+                torch._sync(arg)
+                unwrapped = torch._from_functional_tensor(arg)
+                unwrapped_args.append(unwrapped)
+            else:
+                unwrapped_args.append(arg)
+        with _ExcludeDispatchKeyGuard(DispatchKeySet(DispatchKey.Functionalize)):
+            output = functional_op(*unwrapped_args)
+        num_actual_output = len(mutable_op._schema.returns)
+        actual_output = pytree.tree_map(
+            torch._to_functional_tensor, output[:num_actual_output])
+        new_values_to_propagate = output[num_actual_output:]
+        inputs_to_replace = [arg for is_write, arg in zip(mutable_args(mutable_op), args)
+                             if is_write]
+        assert len(new_values_to_propagate) == len(inputs_to_replace)
+        for new_value, arg in zip(new_values_to_propagate, inputs_to_replace):
+            if (arg is None and new_value is None) or (arg is not None and new_value is not None):
+                continue
+            torch._C._propagate_xla_data(arg, new_value)
+            torch._C._replace_(arg, new_value)
+            torch._C._commit_update(arg)
+            torch._sync(arg)
+        if len(actual_output) == 1:
+            return actual_output[0]
+        elif len(actual_output) == 0:
+            return None
+        return actual_output
+    return kernel
+def validate(mutable_op: OpOverload):
+    if not isinstance(mutable_op, OpOverload):
+        raise TypeError(
+            f"register_functional_op(mutable_op): expected mutable_op to be instance of "
+            f"OpOverload but got {type(mutable_op)}")
+    # There are generally three types of "in-place" or "mutable" ops.
+    # Each of them have their own conventions:
+    # - inplace (first input modified in-place and returned as only output)
+    # - out= (some args modified in-place and returned as outputs)
+    # - mutable (some args modified in-place but none of those returned as outputs)
+    # In theory we can support all three, but we'll just support the last
+    # option right now for simplicity.
+    schema = FunctionSchema.parse(str(mutable_op._schema))
+    if not schema.kind() == SchemaKind.mutable:
+        raise RuntimeError("Expected op to be mutable (as opposed to functional, inplace or out)")
+    for ret in schema.returns:
+        # construct_functionalization_kernel assumes this for simplicity
+        if ret.annotation is not None:
+            raise NotImplementedError(
+                "NYI: register_functional_op(op) where op returns a mutated or aliased value. "
+                "Please file an issue (and as a workaround, modify your operator to "
+                "not return the mutated value or aliases)")
+    for arg in schema.arguments.flat_all:
+        # construct_functionalization_kernel assumes this for simplicity
+        if arg.type.is_tensor_like() and (
+            arg.type != BaseType(BaseTy.Tensor)
+            and arg.type != OptionalType(BaseType(BaseTy.Tensor))
+        ):
+            raise NotImplementedError(
+                "NYI: register_functional_op(op) where op has a List[Tensor] input."
+                "Please file an issue.")
+def functional_schema(new_op_name, op: OpOverload):
+    schema = FunctionSchema.parse(str(op._schema))
+    schema = schema.signature().with_name(OperatorName.parse(new_op_name))
+    return str(schema)
+def mutable_args(op: OpOverload):
+    return tuple(False if arg.alias_info is None else arg.alias_info.is_write
+                 for arg in op._schema.arguments)

torch/_custom_op/impl.py ADDED Viewed

	@@ -0,0 +1,976 @@

+import dataclasses
+import functools
+import inspect
+import sys
+import typing
+import weakref
+from torchgen.model import FunctionSchema, OperatorName, SchemaKind, BaseType, ListType, BaseTy
+import torch
+import torch._C as _C
+import torch.library as library
+from torch._library.abstract_impl import AbstractImplCtx
+from torch.library import get_ctx
+from .autograd import autograd_kernel_indirection, construct_autograd_kernel
+"""
+For a detailed guide on custom ops, please see
+https://docs.google.com/document/d/1aGWtgxV3HppuxQAdddyPrs74_aEntpkYt9MalnCKnhk
+This file includes pieces of the implementation of our custom operator API.
+"""
+__all__ = ["custom_op", "CustomOp", "get_ctx", "AbstractImplCtx"]
+SUPPORTED_DEVICE_TYPE_TO_KEY = {
+    "cpu": "CPU",
+    "cuda": "CUDA",
+}
+# We will not let users register CustomOps with anything that could look like
+# PyTorch internals to avoid confusion.
+RESERVED_NS = {
+    "prim",
+    "prims",
+    "aten",
+    "at",
+    "torch",
+    "pytorch",
+}
+def custom_op(
+    qualname: str, manual_schema: typing.Optional[str] = None
+) -> typing.Callable:
+    r"""Creates a new CustomOp object.
+    WARNING: if you're a user, please do not use this directly
+    (instead use the torch._custom_ops APIs).
+    Also please see the following for a detailed guide on custom ops.
+    https://docs.google.com/document/d/1aGWtgxV3HppuxQAdddyPrs74_aEntpkYt9MalnCKnhk
+    In PyTorch, defining an op (short for "operator") is a two step-process:
+    - we need to define (create) the op
+    - we need to implement behavior for how the operator interacts with
+      various PyTorch subsystems, like CPU/CUDA Tensors, Autograd, etc.
+    This entrypoint defines the CustomOp object (the first step);
+    you must then perform the second step by calling various methods on
+    the CustomOp object.
+    This API is used as a decorator (see examples).
+    Arguments:
+        qualname (str): Should be a string that looks like
+            "namespace::operator_name". Operators in PyTorch need a namespace to
+            avoid name collisions; a given operator may only be created once.
+            If you are writing a Python library, we recommend the namespace to
+            be the name of your top-level module. The operator_name must be
+            the same as the name of the function you pass to custom_op
+            (see examples).
+        manual_schema (Optional[str]): Each PyTorch operator needs a schema that
+            tells PyTorch the types of the inputs/outputs. If None (default),
+            we will infer the schema from the type annotations on the function
+            (see examples). Otherwise, if you don't want to use type annotations,
+            you may provide us the schema string.
+    Example::
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
+        >>> import numpy as np
+        >>> from torch import Tensor
+        >>>
+        >>> # Step 1: define the CustomOp.
+        >>> # We need to provide the decorator a "prototype function"
+        >>> # (a function with Python ellipses as the body).
+        >>> @custom_op("my_library::numpy_sin")
+        >>> def numpy_sin(x: Tensor) -> Tensor:
+        >>>     ...
+        >>>
+        >>> # numpy_sin is now an instance of class CustomOp
+        >>> print(type(numpy_sin))
+        >>>
+        >>> # Step 2: Register an implementation for various PyTorch subsystems
+        >>>
+        >>> # Register an implementation for CPU tensors
+        >>> @numpy_sin.impl('cpu')
+        >>> def numpy_sin_impl_cpu(x):
+        >>>     return torch.from_numpy(np.sin(x.numpy()))
+        >>>
+        >>> # Register an implementation for CUDA tensors
+        >>> @numpy_sin.impl('cuda')
+        >>> def numpy_sin_impl_cuda(x):
+        >>>     return torch.from_numpy(np.sin(x.cpu().numpy())).to(x.device)
+        >>>
+        >>> x = torch.randn(3)
+        >>> numpy_sin(x)  # calls numpy_sin_impl_cpu
+        >>>
+        >>> x_cuda = x.cuda()
+        >>> numpy_sin(x)  # calls numpy_sin_impl_cuda
+    """
+    def inner(func):
+        if not inspect.isfunction(func):
+            raise ValueError(
+                f"custom_op(...)(func): Expected `func` to be a Python "
+                f"function, got: {type(func)}"
+            )
+        ns, name = parse_qualname(qualname)
+        validate_namespace(ns)
+        if func.__name__ != name:
+            raise ValueError(
+                f"custom_op(qualname='{qualname}', ...)(func): expected `func` "
+                f"to have name '{name}' but got '{func.__name__}'. "
+                f"Please either change the name of `func` or the qualname that "
+                f"is passed to `custom_op`"
+            )
+        schema = infer_schema(func) if manual_schema is None else manual_schema
+        schema_str = f"{name}{schema}"
+        function_schema = FunctionSchema.parse(schema_str)
+        validate_schema(function_schema)
+        if manual_schema is not None:
+            validate_function_matches_schema(function_schema, func)
+        lib = library.Library(ns, "FRAGMENT")
+        lib.define(schema_str)
+        ophandle = find_ophandle_or_throw(ns, function_schema.name)
+        result = CustomOp(lib, ns, function_schema, name, ophandle, _private_access=True)
+        result.__name__ = func.__name__
+        result.__module__ = func.__module__
+        result.__doc__ = func.__doc__
+        library.impl(lib, result._opname, "Autograd")(
+            autograd_kernel_indirection(weakref.proxy(result))
+        )
+        torch._C._dispatch_set_report_error_callback(
+            ophandle, functools.partial(report_error_callback, weakref.proxy(result))
+        )
+        return result
+    return inner
+# Global dictionary holding references to all CustomOp objects
+# Yes, it keeps all CustomOps alive (see NOTE [CustomOp lifetime])
+# Used to query the CustomOp associated with a specific C++ dispatcher operator.
+# An example usage is FakeTensor: FakeTensor checks if a specific operator
+# has an implementation registered via the CustomOp API.
+# Indexed by qualname (e.g. aten::foo)
+global_registry: typing.Dict[str, "CustomOp"] = {}
+class CustomOp:
+    r"""Class for custom operators in PyTorch.
+    Use the CustomOp API to create user-defined custom operators that behave
+    just like regular PyTorch operators (e.g. torch.sin, torch.mm) when it
+    comes to various PyTorch subsystems (like torch.compile).
+    To construct a `CustomOp`, use `custom_op`.
+    """
+    def __init__(self, lib, cpp_ns, schema, operator_name, ophandle, *, _private_access=False):
+        super().__init__()
+        if not _private_access:
+            raise RuntimeError(
+                "The CustomOp constructor is private and we do not guarantee "
+                "BC for it. Please use custom_op(...) to create a CustomOp object"
+            )
+        name = f"{cpp_ns}::{operator_name}"
+        self._schema = schema
+        self._cpp_ns = cpp_ns
+        self._lib: library.Library = lib
+        self._ophandle: _C._DispatchOperatorHandle = ophandle
+        # Has the name of the op, e.g. "foo". We cache here for convenience.
+        self._opname: str = operator_name
+        # this is _opname but with namespace. e.g. "custom::foo"
+        self._qualname: str = name
+        self.__name__ = None  # mypy requires this
+        # NB: Some of these impls are registered as kernels to DispatchKeys.
+        # Modifying the _impls dict directly won't do anything in that case.
+        self._impls: typing.Dict[str, typing.Optional[FuncAndLocation]] = {}
+        # See NOTE [CustomOp autograd kernel indirection]
+        self._registered_autograd_kernel_indirection = False
+        global_registry[self._qualname] = self
+    def _register_autograd_kernel_indirection(self):
+        assert not self._registered_autograd_kernel_indirection
+        self._lib.impl(self._opname, autograd_kernel_indirection(weakref.proxy(self)), "Autograd")
+        self._registered_autograd_kernel_indirection = True
+    # Records the impl and the source location in self._impls
+    # Note that this doesn't cause torch.library to use the impl, that
+    # needs to be done in a separate self._lib.impl call.
+    def _register_impl(self, kind, func, stacklevel=2):
+        if self._has_impl(kind):
+            func_and_location = self._impls[kind]
+            assert func_and_location is not None  # Pacify mypy
+            location = func_and_location.location
+            raise RuntimeError(
+                f"Attempting to register a {kind} impl for operator {self._qualname} "
+                f"that already has a {kind} impl registered from Python at "
+                f"{location}. This is not supported."
+            )
+        frame = inspect.getframeinfo(sys._getframe(stacklevel))
+        location = f"{frame.filename}:{frame.lineno}"
+        self._impls[kind] = FuncAndLocation(func, location)
+    def _get_impl(self, kind):
+        return self._impls[kind]
+    def _has_impl(self, kind):
+        return kind in self._impls
+    def _destroy(self):
+        # NOTE: [CustomOp lifetime]
+        # A CustomOp, once created, lives forever. The mechanism is that the
+        # global registry holds a reference to it. However, to make testing
+        # easier, we want to be able to destroy CustomOp objects.
+        # CustomOp._destroy does the job, though it leaves the CustomOp
+        # in a garbage state.
+        del self._lib
+        opnamespace = getattr(torch.ops, self._cpp_ns)
+        if hasattr(opnamespace, self._opname):
+            delattr(opnamespace, self._opname)
+        del global_registry[self._qualname]
+    def __repr__(self):
+        return f'<CustomOp(op="{self._qualname}")>'
+    def __call__(self, *args, **kwargs):
+        # Bypass torch.ops.* and directly do OperatorHandle::callBoxed.
+        # Using torch.ops.* is a bit of a pain (it can be slow and it has lifetime
+        # issues from caching operators that make testing CustomOp difficult).
+        result = _C._dispatch_call_boxed(self._ophandle, *args, **kwargs)
+        return result
+    def impl(
+        self, device_types: typing.Union[str, typing.Iterable[str]], _stacklevel=2,
+    ) -> typing.Callable:
+        r"""Register an implementation for a device type for this CustomOp object.
+        WARNING: if you're a user, please do not use this directly
+        (instead use the torch._custom_ops APIs).
+        Also please see the following for a detailed guide on custom ops.
+        https://docs.google.com/document/d/1aGWtgxV3HppuxQAdddyPrs74_aEntpkYt9MalnCKnhk
+        If the CustomOp is passed multiple Tensor inputs with different device
+        types, it will dispatch to the registered implementation for the highest
+        priority device type among those present.
+        The supported device types, in order of priority, are {'cuda', 'cpu'}.
+        This API is used as a decorator (see examples).
+        Arguments:
+            device_types (str or Iterable[str]): the device type(s) to register the function for.
+        Examples::
+            >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
+            >>> import numpy as np
+            >>> from torch import Tensor
+            >>>
+            >>> @custom_op("my_library::numpy_cos")
+            >>> def numpy_cos(x: Tensor) -> Tensor:
+            >>>     ...
+            >>>
+            >>> # Register an implementation for CPU Tensors
+            >>> @numpy_cos.impl('cpu')
+            >>> def numpy_cos_impl_cpu(x):
+            >>>     return torch.from_numpy(np.cos(x.numpy()))
+            >>>
+            >>> # Register an implementation for CUDA Tensors
+            >>> @numpy_cos.impl('cuda')
+            >>> def numpy_cos_impl_cuda(x):
+            >>>     return torch.from_numpy(np.cos(x.cpu().numpy())).to(x.device)
+            >>>
+            >>> x = torch.randn(3)
+            >>> numpy_cos(x)  # calls numpy_cos_impl_cpu
+            >>>
+            >>> x_cuda = x.cuda()
+            >>> numpy_cos(x)  # calls numpy_cos_impl_cuda
+        """
+        if isinstance(device_types, str):
+            device_types = [device_types]
+        for device_type in device_types:
+            validate_device_type(device_type)
+        def inner(f):
+            for device_type in set(device_types):
+                self._check_doesnt_have_library_impl(device_type)
+                self._register_impl(device_type, f, stacklevel=_stacklevel)
+                dispatch_key = SUPPORTED_DEVICE_TYPE_TO_KEY[device_type]
+                library.impl(self._lib, self._opname, dispatch_key)(f)
+            return f
+        return inner
+    def _check_doesnt_have_library_impl(self, device_type):
+        if self._has_impl(device_type):
+            return
+        key = SUPPORTED_DEVICE_TYPE_TO_KEY[device_type]
+        if _C._dispatch_has_computed_kernel_for_dispatch_key(self._qualname, key):
+            raise RuntimeError(
+                f"impl(..., device_types={device_type}): the operator {self._qualname} "
+                f"already has an implementation for this device type via a "
+                f"pre-existing torch.library or TORCH_LIBRARY registration.")
+    def impl_factory(self) -> typing.Callable:
+        r"""Register an implementation for a factory function."""
+        def inner(f):
+            self._register_impl("factory", f)
+            library.impl(self._lib, self._opname, "BackendSelect")(f)
+            return f
+        return inner
+    def impl_abstract(self, _stacklevel=2) -> typing.Callable:
+        r"""Register an abstract implementation for this operator.
+        WARNING: please do not use this directly (and instead use the torch._custom_ops
+        APIs). Also please see the following for a detailed guide on custom ops.
+        https://docs.google.com/document/d/1aGWtgxV3HppuxQAdddyPrs74_aEntpkYt9MalnCKnhk
+        An "abstract implementation" specifies the behavior of this operator on
+        Tensors that carry no data. Given some input Tensors with certain properties
+        (sizes/strides/storage_offset/device), it specifies what the properties of
+        the output Tensors are.
+        The abstract implementation has the same signature as the operator.
+        It is run for both FakeTensors and meta tensors. To write an abstract
+        implementation, assume that all Tensor inputs to the operator are
+        regular CPU/CUDA/Meta tensors, but they do not have storage, and
+        you are trying to return regular CPU/CUDA/Meta tensor(s) as output.
+        The abstract implementation must consist of only PyTorch operations
+        (and may not directly access the storage or data of any input or
+        intermediate Tensors).
+        This API is used as a decorator (see examples).
+        Examples::
+            >>> import numpy as np
+            >>> from torch import Tensor
+            >>>
+            >>> # Example 1: an operator without data-dependent output shape
+            >>> @custom_op('my_library::custom_linear')
+            >>> def custom_linear(x: Tensor, weight: Tensor, bias: Tensor) -> Tensor:
+            >>>     ...
+            >>>
+            >>> @custom_linear.impl_abstract()
+            >>> def custom_linear_abstract(x, weight):
+            >>>     assert x.dim() == 2
+            >>>     assert weight.dim() == 2
+            >>>     assert bias.dim() == 1
+            >>>     assert x.shape[1] == weight.shape[1]
+            >>>     assert weight.shape[0] == bias.shape[0]
+            >>>     assert x.device == weight.device
+            >>>
+            >>>     return (x @ weight.t()) + bias
+            >>>
+            >>> # Example 2: an operator with data-dependent output shape
+            >>> @custom_op('my_library::custom_nonzero')
+            >>> def custom_nonzero(x: Tensor) -> Tensor:
+            >>>     ...
+            >>>
+            >>> @custom_nonzero.impl_abstract()
+            >>> def custom_nonzero_abstract(x):
+            >>>     # Number of nonzero-elements is data-dependent.
+            >>>     # Since we cannot peek at the data in an abstract impl,
+            >>>     # we use the ctx object to construct a new symint that
+            >>>     # represents the data-dependent size.
+            >>>     ctx = torch._custom_op.get_ctx()
+            >>>     nnz = ctx.create_unbacked_symint()
+            >>>     shape = [x.dim(), nnz]
+            >>>     result = x.new_empty(shape, dtype=torch.long)
+            >>>     return result
+            >>>
+            >>> @custom_nonzero.impl(['cpu', 'cuda'])
+            >>> def custom_nonzero_impl(x):
+            >>>     x_np = to_numpy(x)
+            >>>     res = np.stack(np.nonzero(x_np), axis=1)
+            >>>     # unbacked symbolic ints in PyTorch must be >= 2, so we
+            >>>     # constrain the range to at least 2
+            >>>     if res.shape[0] <= 1:
+            >>>         raise RuntimeError("not supported")
+            >>>     return torch.tensor(res, device=x.device)
+        """
+        def inner(f):
+            self._check_doesnt_have_library_meta_impl()
+            self._register_impl("abstract", f, stacklevel=_stacklevel)
+            location = self._get_impl("abstract").location
+            qualname = self._qualname
+            # Handle DispatchKey.Meta registration
+            @functools.wraps(f)
+            def f_with_ctx(*args, **kwargs):
+                def error_on_ctx():
+                    raise RuntimeError(
+                        f"Attempted to call get_ctx() for the meta implementation "
+                        f"for {qualname}."
+                        f"You have presumably called get_ctx() because the operator "
+                        f"has a data-dependent output shape; if so, there is no "
+                        f"such meta implementation and this error is the correct "
+                        f"behavior. Otherwise, please remove the call to get_ctx() "
+                        f"in the implementation registered with impl_abstract "
+                        f"at {location}"
+                    )
+                with torch._library.abstract_impl.set_ctx_getter(error_on_ctx):
+                    return f(*args, **kwargs)
+            self._lib.impl(self._opname, f_with_ctx, "Meta")
+            return f
+        return inner
+    def _check_can_register_backward(self):
+        def error(detail):
+            raise RuntimeError(
+                f"Cannot use torch._custom_ops APIs to register backward "
+                f"formula for {detail}. Got operator "
+                f"{self._qualname} with schema: {schema}"
+            )
+        schema = self._schema
+        if schema.kind() != SchemaKind.functional:
+            error("non-functional operator")
+        rets = schema.returns
+        if not schema.returns:
+            error("operator with no returns")
+        assert len(rets) > 0
+        is_non_mutating_view = any(
+            r.annotation is not None and not r.annotation.is_write for r in rets
+        )
+        if is_non_mutating_view:
+            error("operator that returns views")
+        # We make assumptions about the schema's return types.
+        allowed_return_types = {
+            BaseType(BaseTy.int): "int",
+            BaseType(BaseTy.SymInt): "SymInt",
+            BaseType(BaseTy.bool): "bool",
+            BaseType(BaseTy.float): "float",
+            BaseType(BaseTy.Tensor): "Tensor",
+            ListType(BaseType(BaseTy.Tensor), None): "List[Tensor]",
+        }
+        for ret in schema.returns:
+            if ret.type in allowed_return_types:
+                continue
+            error(f"operator with return not in {list(allowed_return_types.values())} (got {ret.type})")
+    def _check_doesnt_have_library_autograd_impl(self):
+        if self._registered_autograd_kernel_indirection:
+            return
+        if _C._dispatch_has_kernel_for_dispatch_key(self._qualname, "CompositeImplicitAutograd"):
+            raise RuntimeError(
+                f"impl_backward/impl_save_for_backward: the operator {self._qualname} "
+                f"already has an implementation for this device type via a "
+                f"pre-existing registration to DispatchKey::CompositeImplicitAutograd."
+                f"CompositeImplicitAutograd operators do not need an autograd formula; "
+                f"instead, the operator will decompose into its constituents and those "
+                f"can have autograd formulas defined on them.")
+        # We can improve this by adding "all Autograd<BACKEND> keys", but
+        # realistically people will just be using this API for CPU/CUDA for now.
+        for key in ["Autograd", "AutogradCPU", "AutogradCUDA"]:
+            if _C._dispatch_has_kernel_for_dispatch_key(self._qualname, key):
+                raise RuntimeError(
+                    f"impl_backward/impl_save_for_backward: "
+                    f"the operator {self._qualname} already has an Autograd kernel "
+                    f"registered to DispatchKey::{key} vi a pre-existing "
+                    f"torch.library or TORCH_LIBRARY registration. Please either "
+                    f"remove those registrations or don't use the torch._custom_ops APIs")
+    def _check_doesnt_have_library_meta_impl(self):
+        if self._has_impl("abstract"):
+            return
+        # If the user's operator is CompositeExplicitAutograd,
+        # allow them to impl_abstract. This is being pragmatic
+        # (existing custom ops may have CompositeExplicitAutograd
+        # registration that don't work with Meta kernels, so this
+        # gives them an escape hatch).
+        if (
+            _C._dispatch_has_kernel_for_dispatch_key(self._qualname, "CompositeExplicitAutograd")
+            and not _C._dispatch_has_kernel_for_dispatch_key(self._qualname, "Meta")
+        ):
+            return
+        # Otherwise, if the user's already has a Meta kernel or their
+        # op is CompositeImplicitAutograd or some other alias dispatch key,
+        # raise.
+        # Special case for CompositeImplicitAutograd
+        if _C._dispatch_has_kernel_for_dispatch_key(self._qualname, "CompositeImplicitAutograd"):
+            raise RuntimeError(
+                f"impl_abstract(...): the operator {self._qualname} "
+                f"already has an implementation for this device type via a "
+                f"pre-existing registration to DispatchKey::CompositeImplicitAutograd."
+                f"CompositeImplicitAutograd operators do not need an abstract impl; "
+                f"instead, the operator will decompose into its constituents and those "
+                f"can have abstract impls defined on them.")
+        if _C._dispatch_has_kernel_for_dispatch_key(self._qualname, "Meta"):
+            raise RuntimeError(
+                f"impl_abstract(...): the operator {self._qualname} "
+                f"already has an DispatchKey::Meta implementation via a "
+                f"pre-existing torch.library or TORCH_LIBRARY registration. "
+                f"Please either remove that registration or don't call impl_abstract.")
+    # NOTE ["backward", "save_for_backward", and "autograd"]
+    # As a part of the explicit autograd API, a user must provide us
+    # a "save_for_backward" function and a "backward" function.
+    # When both of these have been provided, then we automatically
+    # construct the "autograd" kernel.
+    def _register_autograd_kernel(self):
+        assert self._has_impl("backward")
+        assert self._has_impl("save_for_backward")
+        kernel = construct_autograd_kernel(
+            self._schema,
+            self._output_differentiability,
+            self,
+            get_op(self._qualname),
+            self._get_impl("save_for_backward").func,
+            self._get_impl("backward").func)
+        self._register_impl("autograd", kernel)
+    def impl_save_for_backward(self, _stacklevel=2):
+        r"""Register a function that tells us what to save for backward.
+        Please see impl_backward for more details.
+        """
+        def inner(f):
+            self._check_can_register_backward()
+            self._check_doesnt_have_library_autograd_impl()
+            if not self._registered_autograd_kernel_indirection:
+                self._register_autograd_kernel_indirection()
+            self._register_impl("save_for_backward", f, stacklevel=_stacklevel)
+            if self._has_impl("backward"):
+                self._register_autograd_kernel()
+        return inner
+    def impl_backward(self, output_differentiability=None, _stacklevel=2):
+        r"""Registers a backward formula.
+        WARNING: if you're a user, please do not use this directly
+        (instead use the torch._custom_ops APIs).
+        Also please see the following for a detailed guide on custom ops.
+        https://docs.google.com/document/d/1aGWtgxV3HppuxQAdddyPrs74_aEntpkYt9MalnCKnhk
+        In order for the CustomOp to work with autograd, you need to register
+        a backward formula. There are two pieces to this:
+        1. You must give us a function to specify what to save for backward.
+           Call this the "save for backward" function.
+        2. You must give us a function that computes gradients. Call this the
+           "backward" function.
+        Use `impl_save_for_backward` to define a "save for backward" function
+        that specifies what gets saved for backward. The function should accept
+        two arguments ``(inputs, output)`` and return the quantities to be saved
+        for backward.
+        During runtime, when you call the CustomOp, PyTorch will invoke the
+        "save for backward" function with the inputs and output of the CustomOp.
+        Use `impl_backward` to define the "backward" function. The backward
+        function must accept ``(ctx, saved, *grads)``:
+        - ``ctx`` is a context object where we may provide information
+        - ``saved`` is exactly what gets returned from the "save for backward"
+          function
+        - ``grads`` is one or more gradients. The number of gradients matches
+          the number of outputs of the CustomOp.
+        The backward function must return a dict that maps the name of
+        an input to the CustomOp to its corresponding gradient. All inputs that
+        were declared to be Tensors in the CustomOp definition must be accounted
+        for in the dict. The gradient may be a Tensor or None.
+        """
+        if output_differentiability is not None:
+            def yell():
+                raise RuntimeError(
+                    f"impl_backward(output_differentiability): expected "
+                    f"output_differentiability to be a list of bools with "
+                    f"length equal to the number of outputs of this CustomOp "
+                    f"got: {output_differentiability}")
+            if not isinstance(output_differentiability, list):
+                yell()
+            for diff in output_differentiability:
+                if not isinstance(diff, bool):
+                    yell()
+            if len(self._schema.returns) != len(output_differentiability):
+                yell()
+        def inner(f):
+            self._check_can_register_backward()
+            self._check_doesnt_have_library_autograd_impl()
+            if not self._registered_autograd_kernel_indirection:
+                self._register_autograd_kernel_indirection()
+            self._register_impl("backward", f, stacklevel=_stacklevel)
+            self._output_differentiability = output_differentiability
+            if self._has_impl("save_for_backward"):
+                self._register_autograd_kernel()
+        return inner
+@dataclasses.dataclass
+class FuncAndLocation:
+    func: typing.Callable
+    location: str
+def find_ophandle_or_throw(cpp_ns: str, operator_name: OperatorName):
+    overload_name = (
+        "" if operator_name.overload_name is None else operator_name.overload_name
+    )
+    return _C._dispatch_find_schema_or_throw(
+        f"{cpp_ns}::{str(operator_name.name)}", overload_name
+    )
+def validate_namespace(ns: str) -> None:
+    if "." in ns:
+        raise ValueError(
+            f'custom_op(..., ns="{ns}"): expected ns to not contain any . (and be a '
+            f"valid variable name)"
+        )
+    if ns in RESERVED_NS:
+        raise ValueError(
+            f"custom_op(..., ns='{ns}'): '{ns}' is a reserved namespace, "
+            f"please choose something else. "
+        )
+def validate_schema(schema: FunctionSchema) -> None:
+    if not torch._library.utils.is_functional_schema(schema):
+        raise ValueError(
+            f"custom_op only supports functional operators "
+            f"(ops that do not mutate any inputs, do not return "
+            f"views of the inputs, and has at least one return). "
+            f"Got the following non-functional schema: {schema}"
+        )
+    # For simplicity: don't allow self arguments
+    if schema.arguments.self_arg is not None:
+        raise ValueError(
+            f"custom_op does not support arguments named 'self'. Please "
+            f"rename your argument. Got: {schema}"
+        )
+def parse_qualname(qualname: str) -> typing.Tuple[str, str]:
+    names = qualname.split("::", 1)
+    if len(names) != 2:
+        raise ValueError(f"Expected there to be a namespace in {qualname}, i.e. The "
+                         f"operator name should look something like ns::foo")
+    if '.' in names[1]:
+        raise ValueError(f"The torch.custom_ops APIs do not handle overloads, "
+                         f"i.e. operator names with '.' in them. "
+                         f"Please name your operator something like ns::foo. "
+                         f"Got: {qualname}")
+    return names[0], names[1]
+def validate_device_type(device_type: str) -> None:
+    if device_type not in SUPPORTED_DEVICE_TYPE_TO_KEY:
+        raise ValueError(
+            f"CustomOp.impl(device_types=[{device_type}, ...]): we only support device_type "
+            f"in {SUPPORTED_DEVICE_TYPE_TO_KEY.keys()}."
+        )
+def supported_param(param: inspect.Parameter) -> bool:
+    return param.kind in (
+        inspect.Parameter.POSITIONAL_OR_KEYWORD,
+        inspect.Parameter.KEYWORD_ONLY,
+    )
+def validate_function_matches_schema(
+    schema: FunctionSchema, func: typing.Callable
+) -> None:
+    sig = inspect.signature(func)
+    if not all(supported_param(p) for _, p in sig.parameters.items()):
+        raise ValueError(
+            f"custom_op(..., manual_schema)(func): positional-only args, "
+            f"varargs, and kwargs are not supported. Please rewrite `func` "
+            f"to not have them. Got `func` with signature: {sig}"
+        )
+    if (
+        any(
+            p.annotation is not inspect.Parameter.empty
+            for _, p in sig.parameters.items()
+        )
+        or sig.return_annotation is not inspect.Signature.empty
+    ):
+        raise ValueError(
+            f"custom_op(..., manual_schema)(func): When passing in a manual "
+            f"schema, we expect `func` to have no type annotations to avoid "
+            f"ambiguity. Got `func` with signature: {sig}"
+        )
+    positional = [
+        (name, param)
+        for name, param in sig.parameters.items()
+        if param.kind == inspect.Parameter.POSITIONAL_OR_KEYWORD
+    ]
+    kwargonly = [
+        (name, param)
+        for name, param in sig.parameters.items()
+        if param.kind == inspect.Parameter.KEYWORD_ONLY
+    ]
+    def error():
+        raise ValueError(
+            f"custom_op(..., manual_schema)(func): When passing in a manual "
+            f"schema, we expect `func`'s signature to match `manual_schema` "
+            f"(aside from type annotations). "
+            f"func's signature: {sig}, manual_schema: {schema}"
+        )
+    def error_default_args():
+        raise ValueError(
+            f"custom_op(..., manual_schema)(func): "
+            f"neither func nor manual_schema should have default "
+            f"arguments. Got "
+            f"func's signature: {sig}, manual_schema: {schema}"
+        )
+    def compare(sig_args, schema_args):
+        if len(sig_args) != len(schema_args):
+            error()
+        for (name, param), arg in zip(sig_args, schema_args):
+            if name != arg.name:
+                error()
+            if param.default is not inspect.Parameter.empty or arg.default is not None:
+                error_default_args()
+    compare(positional, schema.arguments.flat_positional)
+    compare(kwargonly, schema.arguments.flat_kwarg_only)
+def infer_schema(prototype_function: typing.Callable) -> str:
+    sig = inspect.signature(prototype_function)
+    def error_fn(what):
+        raise ValueError(
+            f"custom_op(...)(func): {what} " f"Got func with signature {sig})"
+        )
+    params = [
+        parse_param(name, param, error_fn) for name, param in sig.parameters.items()
+    ]
+    ret = parse_return(sig.return_annotation, error_fn)
+    return f"({', '.join(params)}) -> {ret}"
+def parse_param(name, param, error_fn):
+    if not supported_param(param):
+        error_fn("We do not support positional-only args, varargs, or varkwargs.")
+    if param.annotation is inspect.Parameter.empty:
+        error_fn(f"Parameter {name} must have a type annotation.")
+    if param.annotation not in SUPPORTED_PARAM_TYPES.keys():
+        error_fn(
+            f"Parameter {name} has unsupported type {param.annotation}. "
+            f"The valid types are: {SUPPORTED_PARAM_TYPES.keys()}."
+        )
+    if param.default is not inspect.Parameter.empty:
+        error_fn(
+            f"Parameter {name} has a default value; this is not supported. "
+            f"If you want to use default values then create a function with "
+            f"default values that calls the CustomOp"
+        )
+    return f"{SUPPORTED_PARAM_TYPES[param.annotation]} {name}"
+def derived_types(
+    base_type, cpp_type, list_base, optional_base_list, optional_list_base
+):
+    result = [
+        (base_type, cpp_type),
+        (typing.Optional[base_type], f"{cpp_type}?"),
+    ]
+    if list_base:
+        result.append((typing.Sequence[base_type], f"{cpp_type}[]"))  # type: ignore[valid-type]
+    if optional_base_list:
+        result.append((typing.Sequence[typing.Optional[base_type]], f"{cpp_type}?[]"))  # type: ignore[valid-type]
+    if optional_list_base:
+        result.append((typing.Optional[typing.Sequence[base_type]], f"{cpp_type}[]?"))  # type: ignore[valid-type]
+    return result
+def get_supported_param_types():
+    data = [
+        # (python type, schema type, type[] variant, type?[] variant, type[]? variant
+        (torch.Tensor, "Tensor", True, True, False),
+        (int, "SymInt", True, False, True),
+        (float, "float", True, False, True),
+        (bool, "bool", True, False, True),
+        (str, "str", False, False, False),
+        (torch.types.Number, "Scalar", True, False, False),
+        (torch.dtype, "ScalarType", False, False, False),
+        (torch.device, "Device", False, False, False),
+    ]
+    result = []
+    for line in data:
+        result.extend(derived_types(*line))
+    return dict(result)
+SUPPORTED_RETURN_TYPES = {
+    torch.Tensor: "Tensor",
+    typing.List[torch.Tensor]: "Tensor[]",
+    int: "SymInt",
+    float: "float",
+    bool: "bool",
+    torch.types.Number: "Scalar",
+}
+def parse_return(annotation, error_fn):
+    origin = typing.get_origin(annotation)
+    if origin is not tuple:
+        if annotation not in SUPPORTED_RETURN_TYPES.keys():
+            error_fn(
+                f"Return has unsupported type {annotation}. "
+                f"The valid types are: {SUPPORTED_RETURN_TYPES}."
+            )
+        return SUPPORTED_RETURN_TYPES[annotation]
+    args = typing.get_args(annotation)
+    for arg in args:
+        if arg not in SUPPORTED_RETURN_TYPES:
+            error_fn(
+                f"Return has unsupported type {annotation}. "
+                f"The valid types are: {SUPPORTED_RETURN_TYPES}."
+            )
+    return "(" + ", ".join([SUPPORTED_RETURN_TYPES[arg] for arg in args]) + ")"
+SUPPORTED_PARAM_TYPES = get_supported_param_types()
+def report_error_callback(custom_op: typing.Any, key: str) -> None:
+    if key == "Undefined":
+        raise NotImplementedError(
+            f"{custom_op}: There were no Tensor inputs to this operator "
+            f"(e.g. you passed an empty list of Tensors). If your operator is a "
+            f"factory function (that is, it takes no Tensors and constructs "
+            f"a new one), then please use CustomOp.impl_factory to register "
+            f"an implementation for it"
+        )
+    if key == "Meta":
+        raise NotImplementedError(
+            f"{custom_op}: when running with device='Meta' tensors: there is no "
+            f"abstract impl registered for this CustomOp. Please register one via "
+            f"CustomOp.impl_abstract to get this CustomOp to work with Meta tensors"
+        )
+    if key in ("CPU", "CUDA"):
+        device = key.lower()
+        raise NotImplementedError(
+            f"{custom_op}: when running with device='{device}' tensors: there is no "
+            f"{device} impl registered for this CustomOp. Please register one via "
+            f"CustomOp.impl(device_type='{device}')"
+        )
+    raise NotImplementedError(
+        f"{custom_op}: No implementation for dispatch key {key}. It is likely "
+        f"that we have not added this functionality yet, please either open an "
+        f"issue or if you're feeling adventurous, use the low-level "
+        f"torch.library API"
+    )
+def custom_op_from_existing(op):
+    ns = op.namespace
+    lib = torch.library.Library(ns, "FRAGMENT")
+    name = op.name().split("::")[-1]
+    schema_str = str(op._schema)
+    # CustomOp expects the schema string without the namespace
+    schema_str = schema_str.split("::")[-1]
+    schema = FunctionSchema.parse(schema_str)
+    return CustomOp(lib, ns, schema, name, op, _private_access=True)
+def get_op(qualname):
+    def error_not_found():
+        raise ValueError(
+            f"Could not find the operator {qualname}. Please make sure you have "
+            f"already registered the operator and (if registered from C++) "
+            f"loaded it via torch.ops.load_library.")
+    ns, name = parse_qualname(qualname)
+    if not hasattr(torch.ops, ns):
+        error_not_found()
+    opnamespace = getattr(torch.ops, ns)
+    if not hasattr(opnamespace, name):
+        error_not_found()
+    packet = getattr(opnamespace, name)
+    if not hasattr(packet, 'default'):
+        error_not_found()
+    return packet.default
+def _find_custom_op(qualname, also_check_torch_library=False):
+    if qualname in global_registry:
+        return global_registry[qualname]
+    if not also_check_torch_library:
+        raise RuntimeError(
+            f"Could not find custom op \"{qualname}\". Did you register it via "
+            f"the torch._custom_ops API?")
+    overload = get_op(qualname)
+    result = custom_op_from_existing(overload)
+    return result
+def get_abstract_impl(qualname):
+    if qualname not in torch._custom_op.impl.global_registry:
+        return None
+    custom_op = torch._custom_op.impl.global_registry[qualname]
+    if custom_op is None:
+        return None
+    if not custom_op._has_impl("abstract"):
+        return None
+    return custom_op._get_impl("abstract").func
+def _custom_op_with_schema(qualname, schema):
+    ns, name = qualname.split("::")
+    schema_str = f"{name}{schema}"
+    function_schema = FunctionSchema.parse(schema_str)
+    validate_schema(function_schema)
+    lib = library.Library(ns, "FRAGMENT")
+    lib.define(schema_str)
+    ophandle = find_ophandle_or_throw(ns, function_schema.name)
+    result = CustomOp(lib, ns, function_schema, name, ophandle, _private_access=True)
+    result._register_autograd_kernel_indirection()
+    torch._C._dispatch_set_report_error_callback(
+        ophandle, functools.partial(report_error_callback, weakref.proxy(result))
+    )
+    return get_op(qualname)

torch/_custom_ops.py ADDED Viewed

	@@ -0,0 +1,322 @@

+import inspect
+from torch._custom_op.impl import (
+    _custom_op_with_schema,
+    _find_custom_op,
+    infer_schema,
+    parse_qualname,
+    validate_namespace,
+)
+from torch.library import get_ctx
+__all__ = [
+    "custom_op",
+    "impl",
+    "impl_abstract",
+    "get_ctx",
+    "impl_save_for_backward",
+    "impl_backward",
+]
+def custom_op(qualname, func_or_schema=None):
+    r"""Register a new custom operator
+    In PyTorch, defining an op (short for "operator") is a two step-process:
+    - we need to define the op (by providing an operator name and schema)
+    - we need to implement behavior for how the operator interacts with
+      various PyTorch subsystems, like CPU/CUDA Tensors, Autograd, etc.
+    This entrypoint defines the custom operator (the first step)
+    you must then perform the second step by calling various
+    ``impl_*`` APIs.
+    This API may be used as a decorator (see examples).
+    For a detailed guide on custom ops, please see
+    https://docs.google.com/document/d/1aGWtgxV3HppuxQAdddyPrs74_aEntpkYt9MalnCKnhk
+    Arguments:
+        qualname (str): Should be a string that looks like
+            "namespace::operator_name". Operators in PyTorch need a namespace to
+            avoid name collisions; a given operator may only be created once.
+            If you are writing a Python library, we recommend the namespace to
+            be the name of your top-level module.
+        func_or_schema (Union[Callable, str]): Each PyTorch operator needs a
+            schema that tells PyTorch the types of the inputs/outputs.
+            If this is a Callable, we will automatically infer the schema from
+            the type annotations on the function (see examples). Otherwise,
+            if you don't want to use type annotations, you may provide us the
+            schema string.
+    Example::
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
+        >>> import torch
+        >>> import numpy as np
+        >>> from torch import Tensor
+        >>>
+        >>> # Step 1: define the custom op.
+        >>> # We need to provide the API a "prototype function"
+        >>> # (a function that returns NotImplementedError), from which
+        >>> # we will infer the types of the inputs and outputs.
+        >>> @torch._custom_ops.custom_op("mylibrary::numpy_sin")
+        >>> def numpy_sin(x: Tensor) -> Tensor:
+        >>>     raise NotImplementedError()
+        >>>
+        >>> # The custom op is now accessible via the torch.ops module:
+        >>> torch.ops.mylibrary.numpy_sin
+        >>>
+        >>> # Step 2: Register an implementation for various PyTorch subsystems
+        >>>
+        >>> # Register an implementation for CPU tensors
+        >>> @torch._custom_ops.impl("mylibrary::numpy_sin", device_types="cpu")
+        >>> def numpy_sin_impl_cpu(x):
+        >>>     return torch.from_numpy(np.sin(x.numpy()))
+        >>>
+        >>> # Register an implementation for CUDA tensors
+        >>> @torch._custom_ops.impl("mylibrary::numpy_sin", device_types="cuda")
+        >>> def numpy_sin_impl_cuda(x):
+        >>>     return torch.from_numpy(np.sin(x.cpu().numpy())).to(x.device)
+        >>>
+        >>> x = torch.randn(3)
+        >>> torch.ops.mylibrary.numpy_sin(x)  # calls numpy_sin_impl_cpu
+        >>>
+        >>> x_cuda = x.cuda()
+        >>> torch.ops.mylibrary.numpy_sin(x)  # calls numpy_sin_impl_cuda
+    """
+    ns, name = parse_qualname(qualname)
+    validate_namespace(ns)
+    def inner(func):
+        if not inspect.isfunction(func):
+            raise ValueError(
+                f"custom_op(...)(func): Expected `func` to be a Python "
+                f"function, got: {type(func)}"
+            )
+        if func.__name__ != name:
+            raise ValueError(
+                f"custom_op(qualname='{qualname}', ...)(func): expected `func` "
+                f"to have name '{name}' but got '{func.__name__}'. "
+                f"Please either change the name of `func` or the qualname that "
+                f"is passed to `custom_op`"
+            )
+        schema = infer_schema(func)
+        _custom_op_with_schema(qualname, schema)
+        return func
+    if func_or_schema is None:
+        return inner
+    if isinstance(func_or_schema, str):
+        _custom_op_with_schema(qualname, func_or_schema)
+    else:
+        return inner(func_or_schema)
+def impl(qualname, *, device_types=("cpu", "cuda"), func=None):
+    r"""Register an implementation for a device type for this custom op.
+    If the op is passed multiple Tensor inputs with different device
+    types, it will dispatch to the registered implementation for the highest
+    priority device type among those present.
+    The supported device types, in order of priority, are {'cuda', 'cpu'}.
+    This API may be used as a decorator (see examples).
+    For a detailed guide on custom ops, please see
+    https://docs.google.com/document/d/1aGWtgxV3HppuxQAdddyPrs74_aEntpkYt9MalnCKnhk
+    Arguments:
+        device_types (str or Iterable[str]): the device type(s) to register the function for.
+    Example::
+        >>> # xdoctest: +REQUIRES(env:TORCH_DOCTEST_CUDA)
+        >>> import torch
+        >>> import numpy as np
+        >>> from torch import Tensor
+        >>>
+        >>> # Step 1: define the custom op.
+        >>> # We need to provide the API a "prototype function"
+        >>> # (a function that returns NotImplementedError), from which
+        >>> # we will infer the types of the inputs and outputs.
+        >>> @torch._custom_ops.custom_op("mylibrary::numpy_cos")
+        >>> def numpy_cos(x: Tensor) -> Tensor:
+        >>>     raise NotImplementedError()
+        >>>
+        >>> # The custom op is now accessible via the torch.ops module:
+        >>> torch.ops.mylibrary.numpy_cos
+        >>>
+        >>> # Step 2: Register an implementation for various PyTorch subsystems
+        >>>
+        >>> # Register an implementation for CPU tensors
+        >>> @torch._custom_ops.impl("mylibrary::numpy_cos", device_types="cpu")
+        >>> def numpy_cos_impl_cpu(x):
+        >>>     return torch.from_numpy(np.cos(x.numpy()))
+        >>>
+        >>> # Register an implementation for CUDA tensors
+        >>> @torch._custom_ops.impl("mylibrary::numpy_cos", device_types="cuda")
+        >>> def numpy_cos_impl_cuda(x):
+        >>>     return torch.from_numpy(np.cos(x.cpu().numpy())).to(x.device)
+        >>>
+        >>> x = torch.randn(3)
+        >>> torch.ops.mylibrary.numpy_cos(x)  # calls numpy_cos_impl_cpu
+        >>>
+        >>> x_cuda = x.cuda()
+        >>> torch.ops.mylibrary.numpy_cos(x)  # calls numpy_cos_impl_cuda
+    """
+    def inner(func):
+        custom_op = _find_custom_op(qualname, also_check_torch_library=True)
+        custom_op.impl(device_types, _stacklevel=3)(func)
+        return func
+    if func is None:
+        return inner
+    return inner(func)
+def impl_abstract(qualname, *, func=None):
+    r"""Register an abstract implementation for this operator.
+    An "abstract implementation" specifies the behavior of this operator on
+    Tensors that carry no data. Given some input Tensors with certain properties
+    (sizes/strides/storage_offset/device), it specifies what the properties of
+    the output Tensors are.
+    The abstract implementation has the same signature as the operator.
+    It is run for both FakeTensors and meta tensors. To write an abstract
+    implementation, assume that all Tensor inputs to the operator are
+    regular CPU/CUDA/Meta tensors, but they do not have storage, and
+    you are trying to return regular CPU/CUDA/Meta tensor(s) as output.
+    The abstract implementation must consist of only PyTorch operations
+    (and may not directly access the storage or data of any input or
+    intermediate Tensors).
+    This API may be used as a decorator (see examples).
+    For a detailed guide on custom ops, please see
+    https://docs.google.com/document/d/1aGWtgxV3HppuxQAdddyPrs74_aEntpkYt9MalnCKnhk
+    Examples::
+        >>> import numpy as np
+        >>> from torch import Tensor
+        >>>
+        >>> # Example 1: an operator without data-dependent output shape
+        >>> @torch._custom_ops.custom_op("mylibrary::custom_linear")
+        >>> def custom_linear(x: Tensor, weight: Tensor, bias: Tensor) -> Tensor:
+        >>>     raise NotImplementedError()
+        >>>
+        >>> @torch._custom_ops.impl_abstract("mylibrary::custom_linear")
+        >>> def custom_linear_abstract(x, weight):
+        >>>     assert x.dim() == 2
+        >>>     assert weight.dim() == 2
+        >>>     assert bias.dim() == 1
+        >>>     assert x.shape[1] == weight.shape[1]
+        >>>     assert weight.shape[0] == bias.shape[0]
+        >>>     assert x.device == weight.device
+        >>>
+        >>>     return (x @ weight.t()) + bias
+        >>>
+        >>> # Example 2: an operator with data-dependent output shape
+        >>> @torch._custom_ops.custom_op('mylibrary::custom_nonzero')
+        >>> def custom_nonzero(x: Tensor) -> Tensor:
+        >>>     ...
+        >>>
+        >>> @torch._custom_ops.impl_abstract("mylibrary::custom_nonzero")
+        >>> def custom_nonzero_abstract(x):
+        >>>     # Number of nonzero-elements is data-dependent.
+        >>>     # Since we cannot peek at the data in an abstract impl,
+        >>>     # we use the ctx object to construct a new symint that
+        >>>     # represents the data-dependent size.
+        >>>     ctx = torch._custom_ops.get_ctx()
+        >>>     nnz = ctx.create_unbacked_symint()
+        >>>     shape = [x.dim(), nnz]
+        >>>     result = x.new_empty(shape, dtype=torch.long)
+        >>>     return result
+        >>>
+        >>> @torch._custom_ops.impl("mylibrary::custom_nonzero")
+        >>> def custom_nonzero_impl(x):
+        >>>     x_np = to_numpy(x)
+        >>>     res = np.stack(np.nonzero(x_np), axis=1)
+        >>>     # unbacked symbolic ints in PyTorch must be >= 2, so we
+        >>>     # constrain the range to at least 2
+        >>>     if res.shape[0] <= 1:
+        >>>         raise RuntimeError("not supported")
+        >>>     return torch.tensor(res, device=x.device)
+    """
+    import torch.library
+    return torch.library.impl_abstract(qualname, func, _stacklevel=2)
+def impl_save_for_backward(qualname, *, func=None):
+    r"""Register a function that tells us what to save for backward.
+    Please see :func:`impl_backward` for more details.
+    """
+    def inner(func):
+        custom_op = _find_custom_op(qualname, also_check_torch_library=True)
+        custom_op.impl_save_for_backward(_stacklevel=3)(func)
+        return func
+    if func is None:
+        return inner
+    return inner(func)
+def impl_backward(qualname, output_differentiability=None, *, func=None):
+    r"""Registers a backward formula for an operator.
+    In order for an operator to work with autograd, you need to register
+    a backward formula. There are two pieces to this:
+    1. You must give us a function to specify what to save for backward.
+       Call this the "save for backward" function.
+    2. You must give us a function that computes gradients. Call this the
+       "backward" function.
+    Use `impl_save_for_backward` to define a "save for backward" function
+    that specifies what gets saved for backward. The function should accept
+    two arguments ``(inputs, output)`` and return the quantities to be saved
+    for backward.
+    During runtime, when you call the operator in a forwards pass, PyTorch
+    will invoke the "save for backward" function with the inputs and output
+    of the operator.
+    Use `impl_backward` to define the "backward" function. The backward
+    function must accept ``(ctx, saved, *grads)``:
+    - ``ctx`` is a context object where we may provide information
+    - ``saved`` is exactly what gets returned from the "save for backward"
+      function
+    - ``grads`` is one or more gradients. The number of gradients matches
+      the number of outputs of the operator.
+    The backward function must return a dict that maps the name of
+    an input to the operator to its corresponding gradient. All inputs that
+    were declared to be Tensors in the operator definition must be accounted
+    for in the dict. The gradient may be a Tensor or None.
+    For a detailed guide on custom ops, please see
+    https://docs.google.com/document/d/1aGWtgxV3HppuxQAdddyPrs74_aEntpkYt9MalnCKnhk
+    """
+    def inner(func):
+        custom_op = _find_custom_op(qualname, also_check_torch_library=True)
+        custom_op.impl_backward(output_differentiability, _stacklevel=3)(func)
+        return func
+    if func is None:
+        return inner
+    return inner(func)
+def _destroy(qualname):
+    """De-registers a custom op. For testing purposes only"""
+    custom_op = _find_custom_op(qualname)
+    custom_op._destroy()

torch/_decomp/__init__.py ADDED Viewed

	@@ -0,0 +1,444 @@

+import inspect
+from collections import defaultdict
+from functools import wraps
+from itertools import chain
+from typing import Callable, Dict, List, Sequence, Union
+import torch
+import torch.library
+from torch._ops import HigherOrderOperator, OpOverload, OpOverloadPacket
+from torch._prims_common import CustomOutParamAnnotation
+from torch.utils import _pytree as pytree
+__all__ = [
+    "decomposition_table",
+    "pre_autograd_decomposition_table",
+    "meta_table",
+    "register_decomposition",
+    "get_decompositions",
+    "core_aten_decompositions",
+]
+# TODO: relax key type here; torch registrations should be possible to; but
+# right now this type is accurate
+global_decomposition_table: Dict[
+    str, Dict[torch._ops.OperatorBase, Callable]
+] = defaultdict(dict)
+decomposition_table = global_decomposition_table["post_autograd"]
+pre_autograd_decomposition_table = global_decomposition_table["pre_autograd"]
+meta_table = global_decomposition_table["meta"]
+def _add_op_to_registry(registry, op, fn):
+    """
+    This is an internal API for adding an op to the decomposition table.
+    If op is OpOverload, it will be added to the registry directly.
+    If op is OpOverloadPacket, all the valid op_overloads in the packet will be added to the registry.
+    """
+    overloads: List[Union[torch._ops.OperatorBase]] = []
+    if isinstance(op, HigherOrderOperator):
+        # There's no concept of overloads for HigherOrderOperator
+        registry[op] = fn
+        return
+    elif isinstance(op, OpOverload):
+        overloads.append(op)
+    else:
+        assert isinstance(op, OpOverloadPacket)
+        for ol in op.overloads():
+            overloads.append(getattr(op, ol))
+    for op_overload in overloads:
+        if op_overload in registry:
+            raise RuntimeError(f"duplicate registrations for {op_overload}")
+        # TorchScript dumps a bunch of extra nonsense overloads
+        # which don't have corresponding dispatcher entries, we need
+        # to filter those out, e.g aten.add.float_int
+        if torch._C._dispatch_has_kernel(op_overload.name()):
+            registry[op_overload] = fn
+def _convert_out_params(f):
+    out_annotation = f.__annotations__.get("out")
+    # If there are no out params, do not wrap the function.
+    if not out_annotation:
+        return f
+    # Hack to detect when out is a Tuple. There seems to be no pretty way of doing this
+    if getattr(out_annotation, "__origin__", None) is tuple:
+        sig = inspect.signature(f)
+        out_names = sig.return_annotation._fields
+        # If out is a tuple, we need to register a function that unpacks all the out
+        # elements as this is what native_functions.yaml expects
+        @wraps(f)
+        def _fn(*args, **kwargs):
+            out_kwargs = tuple(kwargs.pop(o, None) for o in out_names)
+            # Either all of the out kwargs are set or none of them
+            is_none = out_kwargs[0] is None
+            assert all((o is None) == is_none for o in out_kwargs)
+            return f(*args, **kwargs, out=None if is_none else out_kwargs)
+        out_params = [
+            inspect.Parameter(
+                o,
+                kind=inspect.Parameter.KEYWORD_ONLY,
+                default=None,
+                annotation=t,
+            )
+            for o, t in zip(out_names, out_annotation.__args__)
+        ]
+        # Drop the out parameter and concatenate the new kwargs in the signature
+        params = chain((v for k, v in sig.parameters.items() if k != "out"), out_params)
+        _fn.__signature__ = inspect.Signature(  # type: ignore[attr-defined]
+            parameters=params, return_annotation=sig.return_annotation  # type: ignore[arg-type]
+        )
+        # Drop the out parameter and concatenate the new kwargs in the annotations
+        _fn.__annotations__ = {k: v for k, v in f.__annotations__.items() if k != "out"}
+        for o in out_params:
+            _fn.__annotations__[o.name] = o.annotation
+        # Propagate that this function is wrapped by `out_wrapper`
+        _fn._torch_decompositions_out_wrapper = f._torch_decompositions_out_wrapper  # type: ignore[attr-defined]
+        return _fn
+    # Alternatively, there may be a single tensor out parameter with a name
+    # other than "out". This will need special treatment and is indicated by an
+    # annotation, which we will remove here so it is not exposed after wrapping.
+    custom_out_param_name = f.__annotations__.pop(CustomOutParamAnnotation, None)
+    if custom_out_param_name:
+        @wraps(f)
+        def _fn(*args, **kwargs):
+            out_kwarg = kwargs.pop(custom_out_param_name, None)
+            return f(*args, **kwargs, out=out_kwarg)
+        out_param = inspect.Parameter(
+            custom_out_param_name,
+            kind=inspect.Parameter.KEYWORD_ONLY,
+            default=None,
+            annotation=out_annotation,
+        )
+        # Drop the out parameter and concatenate the new kwarg in the signature
+        sig = inspect.signature(f)
+        params = chain(
+            (v for k, v in sig.parameters.items() if k != "out"), (out_param,)
+        )
+        _fn.__signature__ = inspect.Signature(  # type: ignore[attr-defined]
+            parameters=params, return_annotation=sig.return_annotation  # type: ignore[arg-type]
+        )
+        # Drop the out parameter and concatenate the new kwargs in the annotations
+        _fn.__annotations__ = {k: v for k, v in f.__annotations__.items() if k != "out"}
+        _fn.__annotations__[out_param.name] = out_param.annotation
+        return _fn
+    return f
+def register_decomposition(
+    aten_op, registry=None, *, type="post_autograd", unsafe=False
+):
+    """
+    A decorator to register a function as a decomposition to the Python
+    decomposition table.  Use it like this::
+        @register_decomposition(torch.ops.aten.clamp_min)
+        def clamp_min(x):
+            return torch.clamp(self, min=min)
+    If you are writing a new decomposition, consider contributing it
+    directly to PyTorch in torch._decomp.decompositions.
+    This API is experimental; we are almost certainly going to extend
+    the API when we make decompositions eligible for use in transforms (e.g.,
+    autograd) and not just backend tracing, where we then need to know if a
+    decomposition can be used to simulate a transform.
+    By default, we also will register it to the Meta key of dispatcher,
+    and replace the c++ Meta implementation if there is already one.
+    unsafe kwarg is for reuse of this function for registering non-function
+    things
+    """
+    assert type in {"post_autograd", "pre_autograd", "meta"}
+    def decomposition_decorator(fn: Callable) -> Callable:
+        if not unsafe:
+            fn = _convert_out_params(fn)
+        nonlocal registry
+        if registry is None:
+            registry = global_decomposition_table[type]
+        def register(op):
+            _add_op_to_registry(registry, op, fn)
+        # To handle allowing multiple aten_ops at once
+        pytree.tree_map_(register, aten_op)
+        return fn
+    return decomposition_decorator
+def get_decompositions(
+    aten_ops: Sequence[Union[torch._ops.OperatorBase, OpOverloadPacket]],
+    type: str = "post_autograd",
+) -> Dict[torch._ops.OperatorBase, Callable]:
+    """
+    Retrieve a dictionary of decompositions corresponding to the list of
+    operator overloads and overload packets passed as input.  Overload
+    packets will include all decomposed overloads in the packet.  If there is
+    no decomposition for a requested operator, it is silently ignored.
+    This API is experimental; we are almost certainly going to give an alternate,
+    more recommended formulation, where a user provides the set of operators
+    they know how to implement, and we provide decompositions for everything
+    not in this set.
+    """
+    assert type in {"post_autograd", "pre_autograd", "meta"}
+    registry = global_decomposition_table[type]
+    packets_to_overloads = defaultdict(list)
+    for opo in registry:
+        if isinstance(opo, (OpOverload, OpOverloadPacket)):
+            packets_to_overloads[opo.overloadpacket].append(opo)
+    decompositions: Dict[torch._ops.OperatorBase, Callable] = {}
+    for op in aten_ops:
+        if isinstance(op, OpOverloadPacket) and op in packets_to_overloads:
+            for op_overload in packets_to_overloads[op]:
+                decompositions[op_overload] = registry[op_overload]
+        elif isinstance(op, (torch._ops.OperatorBase)) and op in registry:
+            decompositions[op] = registry[op]
+    return decompositions
+def remove_decompositions(
+    decompositions: Dict[torch._ops.OperatorBase, Callable],
+    aten_ops: Sequence[Union[OpOverload, OpOverloadPacket]],
+) -> None:
+    """
+    Given a dictionary of decompositions obtained from get_decompositions(), removes
+    operators associated with a list of operator overloads and overload packets passed
+    as input. If the decomposition dictionary does not contain a decomposition that is
+    specified to be removed, it is silently ignored.
+    """
+    for op in aten_ops:
+        if isinstance(op, OpOverloadPacket):
+            for overload_name in op.overloads():
+                opo = getattr(op, overload_name)
+                decompositions.pop(opo, None)
+        elif isinstance(op, OpOverload):
+            decompositions.pop(op, None)
+# populate the table
+import torch._decomp.decompositions
+import torch._refs
+# See NOTE [Core ATen Ops]
+#
+# list was copied from torch/_inductor/decomposition.py
+# excluding decompositions that results in prim ops
+# Resulting opset of decomposition is core aten ops
+def core_aten_decompositions() -> Dict[torch._ops.OperatorBase, Callable]:
+    aten = torch.ops.aten
+    return get_decompositions(
+        [
+            aten.addcdiv,
+            aten.addcdiv_,
+            aten.addcmul,
+            aten.addcmul_,
+            aten.addr,
+            aten.affine_grid_generator,
+            aten.all,
+            aten.aminmax,
+            aten.arange.default,
+            aten.arange.start,
+            aten.avg_pool2d_backward,
+            aten.baddbmm,
+            aten.binary_cross_entropy,
+            aten.binary_cross_entropy_backward,
+            aten.binary_cross_entropy_with_logits,
+            aten.celu,
+            aten.celu_,
+            aten.clamp_max,
+            aten.clamp_min,
+            aten.col2im,
+            aten.count_nonzero,
+            aten.cudnn_batch_norm,
+            aten.cudnn_batch_norm_backward,
+            aten.deg2rad,
+            aten.deg2rad_,
+            aten.detach,
+            aten.diag_embed,
+            aten.diagonal_backward,
+            aten.dot,
+            aten.vdot,
+            aten.elu,
+            aten.elu_,
+            aten.elu_backward,
+            aten._embedding_bag,
+            aten.embedding_dense_backward,
+            aten.empty_like,
+            aten._euclidean_dist.default,
+            aten.expand_as,
+            aten.eye,
+            aten.fill,
+            aten.fill_,
+            aten.floor_divide,
+            aten.frac,
+            aten.frac_,
+            aten._fused_moving_avg_obs_fq_helper,
+            aten.gelu_,
+            aten.gelu_backward,
+            aten.glu,
+            aten.glu_backward,
+            aten.hardshrink,
+            aten.hardsigmoid,
+            aten.hardsigmoid_,
+            aten.hardsigmoid_backward,
+            aten.hardswish,
+            aten.hardswish_,
+            aten.hardswish_backward,
+            aten.hardtanh_,
+            aten.hardtanh_backward,
+            aten.heaviside,
+            aten.heaviside_,
+            aten.huber_loss,
+            aten.huber_loss_backward,
+            aten.im2col,
+            aten.index_add,
+            aten.index_add_,
+            aten.index_copy,
+            aten.index_copy_,
+            aten.index_fill,
+            aten.index_fill_,
+            aten.isneginf,
+            aten.isposinf,
+            aten.l1_loss,
+            aten.leaky_relu_,
+            aten.leaky_relu_backward,
+            aten.lerp,
+            aten.lerp_,
+            aten.linspace,
+            aten.logaddexp,
+            aten.logaddexp2,
+            aten.logit,
+            aten.logit_,
+            aten.logit_backward,
+            aten.log_sigmoid_backward,
+            aten.log_sigmoid_forward,
+            aten._log_softmax_backward_data,
+            aten.logspace,
+            aten.logsumexp.default,
+            aten.masked_fill,
+            aten.masked_fill_,
+            aten.mish,
+            aten.mish_,
+            aten.mse_loss,
+            aten.mse_loss_backward,
+            aten.multi_margin_loss,
+            aten.multilabel_margin_loss_forward,
+            aten.mv,
+            aten.mvlgamma,
+            aten.mvlgamma_,
+            aten.nansum,
+            aten.nan_to_num,
+            aten.nan_to_num_,
+            aten.narrow,
+            aten.native_batch_norm_backward,
+            aten.native_dropout_backward,
+            aten.native_group_norm_backward,
+            aten.native_layer_norm_backward,
+            aten.new_empty,
+            aten.new_full,
+            aten.new_ones,
+            aten.new_zeros,
+            aten.nll_loss_backward,
+            aten.nll_loss_forward,
+            aten.norm,
+            aten.ones,
+            aten.ones_like,
+            aten._prelu_kernel,
+            aten._prelu_kernel_backward,
+            aten._reshape_alias,
+            aten.rad2deg,
+            aten.rad2deg_,
+            aten.renorm,
+            aten.renorm_,
+            aten.replication_pad2d,
+            aten.rot90,
+            aten.rrelu_with_noise,
+            aten.rrelu_with_noise_,
+            aten.rsub.Scalar,
+            aten.rsub.Tensor,
+            aten._scaled_dot_product_flash_attention.default,
+            aten.select_backward,
+            aten.select_scatter,
+            aten.sgn,
+            aten.sgn_,
+            aten.sigmoid_backward,
+            aten.silu,
+            aten.silu_,
+            aten.silu_backward,
+            aten.sinc,
+            aten.sinc_,
+            aten.slice_backward,
+            aten.smooth_l1_loss,
+            aten.smooth_l1_loss_backward,
+            aten.soft_margin_loss,
+            aten.soft_margin_loss_backward,
+            aten._softmax_backward_data,
+            aten.softplus,
+            aten.softplus_backward,
+            aten.softshrink,
+            aten.special_entr,
+            aten.special_log_ndtr,
+            aten.special_xlog1py,
+            aten.split.Tensor,
+            aten.squeeze.default,
+            aten.squeeze.dim,
+            aten.std,
+            aten.std_mean,
+            aten.stack,
+            aten.sum.default,
+            aten.sum.out,
+            aten.t,
+            aten.tanh_backward,
+            aten.threshold,
+            aten.threshold_,
+            aten.threshold_backward,
+            aten.trace,
+            aten.transpose.int,
+            aten.tril,
+            aten.tril_,
+            aten.triu,
+            aten.triu_,
+            aten.unbind,
+            aten.unfold_backward,
+            aten.unfold_copy,
+            aten._unsafe_index,
+            aten.unsafe_split.Tensor,
+            aten.unsafe_split_with_sizes,
+            aten._unsafe_view,
+            aten.upsample_bilinear2d,
+            aten.upsample_nearest2d_backward,
+            aten.view_as_complex,
+            aten.xlogy,
+            aten.xlogy_,
+            aten.zero,
+            aten.zero_,
+            aten.zeros,
+            aten.zeros_like,
+            aten._weight_norm_interface,
+        ]
+    )

torch/_decomp/__pycache__/__init__.cpython-310.pyc ADDED Viewed

Binary file (12.2 kB). View file

torch/_decomp/__pycache__/decompositions.cpython-310.pyc ADDED Viewed

Binary file (102 kB). View file

torch/_decomp/__pycache__/decompositions_for_jvp.cpython-310.pyc ADDED Viewed

Binary file (6.27 kB). View file

torch/_decomp/__pycache__/decompositions_for_rng.cpython-310.pyc ADDED Viewed

Binary file (7.99 kB). View file

torch/_decomp/decompositions.py ADDED Viewed

The diff for this file is too large to render. See raw diff

torch/_decomp/decompositions_for_jvp.py ADDED Viewed

	@@ -0,0 +1,302 @@

+import inspect
+from typing import Callable, Dict, List, Optional, Tuple
+import torch
+import torch._decomp
+from torch import Tensor
+from torch._prims_common.wrappers import _maybe_remove_out_wrapper
+decomposition_table = torch._decomp.decomposition_table
+decomposition_table_for_jvp: Dict[torch._ops.OperatorBase, Callable] = {}
+register_decomposition = torch._decomp.register_decomposition
+aten = torch.ops.aten
+# NOTE: [forward-mode AD decompositions mechanism]
+#
+# The mechanism is in VariableType,
+#   IF any inputs have forward grad
+#      AND there is no forward AD formula implemented
+#      AND the functions is actually differentiable
+#   run the decomposition
+#      See run_jit_decomposition_with_args_for_jvp
+#      We currently use python decompositions that we torchscript.
+#
+# Note that we would be building the backward graph at the decomposed level
+# too, but that is OK, because we would've errored out otherwise anyway.
+#
+# TODO: The mechanism we are using to register decompositions doesn't
+# seem to be exclusively used for jvp. So open question here is whether
+# torch/csrc/jit/runtime/decomposition_registry.cpp is being used for other things.
+# If that is the case, we may go down the decomposition path unexpectedly
+# (and possibly produce an unintelligible error) vs erroring out earlier and
+# printing that the forward AD formula is not implemented.
+#
+# The solution to this may be to have a explicitly white list control when
+# to enable the decomposition.
+def maybe_register_decomposition(op):
+    def decorator(f):
+        try:
+            return register_decomposition(op)(f)
+        except Exception:
+            return f
+    return decorator
+# Functions where we need a special decomposition for jvp but there's another version that
+# should be used more generally (ex. for jvp we need to recompute the mean and variance for
+# the backwards of a normalization function. Without jvp, it should use the saved value)
+decomposition_table_for_jvp = {}
+def register_decomposition_for_jvp(fn):
+    return register_decomposition(fn, registry=decomposition_table_for_jvp)
+def _register_jit_decomposition_for_jvp(decomp, use_python=False):
+    if decomp in decomposition_table_for_jvp:
+        decomposition_table_used = decomposition_table_for_jvp
+    elif decomp in decomposition_table:
+        decomposition_table_used = decomposition_table
+    else:
+        raise RuntimeError(f"could not find decomposition for {decomp}")
+    decomp_fn = decomposition_table_used[decomp]
+    # `out_wrapper` extends a decompositions signature with
+    # an `out` parameter. However jit will use the unwrapped function's
+    # signature instead so we need to unwrap here to prevent an error
+    decomp_fn = _maybe_remove_out_wrapper(decomp_fn)
+    if use_python:
+        decomp_fn = torch.jit.ignore(decomp_fn)
+        sig = inspect.signature(decomp_fn)
+        # Create a string wrapping the function from the signature
+        # example output:
+        # def wrapped_decomp(x: torch.Tensor, y: int, z: int):
+        #   return decomp_fn(x, y, z)
+        # Thanks copilot!
+        def get_function_def(sig):
+            param_def = [f"{param_str}" for param_str in sig.parameters.values()]
+            param_use = [f"{param_str}" for param_str in sig.parameters.keys()]
+            return f"def wrapped_decomp({', '.join(param_def)}):\n  return decomp_fn({', '.join(param_use)})\n"
+        f_str = get_function_def(sig)
+        graph = torch.jit.CompilationUnit(f_str).wrapped_decomp.graph
+    else:
+        graph = torch.jit.script(decomp_fn).graph
+    torch.jit._register_decomposition(decomp, graph)
+# The only decompositions here are temporary or hacks for the purposes of jvp
+# TODO: do these also belong here?
+@maybe_register_decomposition(aten.trace.default)
+def trace(self: Tensor) -> Tensor:
+    return torch.sum(torch.diag(self))
+@maybe_register_decomposition(aten.log_sigmoid_forward.default)
+def log_sigmoid_forward(self: Tensor) -> Tuple[Tensor, Tensor]:
+    min = torch.minimum(self.new_zeros(()), self)
+    z = torch.exp(-torch.abs(self))
+    if self.is_cuda:
+        buffer = self.new_zeros((0,))
+    else:
+        buffer = z
+    return min - torch.log1p(z), buffer
+def recompute_mean_var(
+    input: Tensor, rstd: Tensor, inner_dim_indices: List[int], keepdim: bool
+):
+    # for most norm decompositions, it will be the same as the core version except for here.
+    # We recompute the mean and variance so that they track gradients through input
+    mean = torch.mean(input, dim=inner_dim_indices, keepdim=keepdim)
+    var = torch.var(input, dim=inner_dim_indices, unbiased=False, keepdim=keepdim)
+    eps = torch.pow(1 / rstd, 2) - var  # this makes me so sad inside
+    eps = eps.detach()
+    rstd = 1 / torch.sqrt(var + eps)
+    return mean, rstd
+@register_decomposition_for_jvp(aten.native_layer_norm_backward)
+def native_layer_norm_backward(
+    grad_out: Tensor,
+    input: Tensor,
+    normalized_shape: List[int],
+    mean: Tensor,
+    rstd: Tensor,
+    weight: Optional[Tensor],
+    bias: Optional[Tensor],
+    output_mask: List[bool],
+) -> Tuple[Optional[Tensor], Optional[Tensor], Optional[Tensor]]:
+    input_shape = input.shape
+    input_ndim = input.dim()
+    axis = input_ndim - len(normalized_shape)
+    inner_dims = input_shape[axis:]
+    outer_dims = input_shape[:axis]
+    inner_dim_indices = list(range(axis, input_ndim))
+    outer_dim_indices = list(range(0, axis))
+    N = 1
+    for i in inner_dims:
+        N *= i
+    M = 1
+    for i in outer_dims:
+        M *= i
+    if M <= 0 or N <= 0:
+        return (
+            input.new_zeros(input_shape),
+            input.new_zeros(input_shape[axis:]),
+            input.new_zeros(input_shape[axis:]),
+        )
+    mean_, rstd_ = recompute_mean_var(input, rstd, inner_dim_indices, keepdim=True)
+    x_hat = (input - mean_) * rstd_
+    if weight is not None:
+        grad_x_hat = grad_out * weight
+    else:
+        grad_x_hat = grad_out
+    a = grad_x_hat * N
+    b = torch.sum(grad_x_hat, inner_dim_indices, True)
+    c1 = torch.mul(grad_x_hat, x_hat)
+    c2 = torch.sum(c1, inner_dim_indices, True)
+    c3 = torch.mul(x_hat, c2)
+    inner = a - b - c3
+    if output_mask[0]:
+        d_input: Optional[Tensor] = (rstd_ / N) * inner
+    else:
+        d_input = torch.zeros_like(input)  # should be None but doesn't work with vjp
+    if output_mask[1] and weight is not None:
+        if len(outer_dim_indices) > 0:
+            d_weight: Optional[Tensor] = torch.sum(
+                grad_out * x_hat, outer_dim_indices, False
+            )
+        else:
+            d_weight = grad_out * x_hat
+    elif weight is not None:
+        d_weight = torch.zeros_like(weight)  # should be None but doesn't work with vjp
+    else:
+        d_weight = torch.zeros(())  # should be None but doesn't work with vjp
+    if output_mask[2] and bias is not None:
+        if len(outer_dim_indices) > 0:
+            d_bias: Optional[Tensor] = torch.sum(grad_out, outer_dim_indices, False)
+        else:
+            d_bias = grad_out.clone()
+    elif bias is not None:
+        d_bias = torch.zeros_like(bias)  # should be None but doesn't work with vjp
+    else:
+        d_bias = torch.zeros(())  # should be None but doesn't work with vjp
+    return (d_input, d_weight, d_bias)
+def prod(x: List[int]):
+    r = 1
+    for i in x:
+        r *= i
+    return r
+@register_decomposition_for_jvp(aten.native_batch_norm_backward)
+def native_batch_norm_backward(
+    grad_out: Tensor,
+    input: Tensor,
+    weight: Optional[Tensor],
+    running_mean: Optional[Tensor],
+    running_var: Optional[Tensor],
+    save_mean: Optional[Tensor],
+    save_invstd: Optional[Tensor],
+    train: bool,
+    eps: float,
+    output_mask: List[bool],
+) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]:
+    input_shape = input.shape
+    input_rank = input.dim()
+    assert input_rank >= 2, "rank of the input must be at least 2"
+    axis = 1
+    num_features = prod(input_shape) / input_shape[axis]  # type: ignore[arg-type]
+    mean = save_mean
+    invstd = save_invstd
+    if train:
+        assert (
+            save_mean is not None and save_invstd is not None
+        ), "when train=True, save_mean and save_invstd are required"
+        reduciton_dims = [0] + list(range(2, input.dim()))
+        assert invstd is not None  # for typing
+        mean, invstd = recompute_mean_var(input, invstd, reduciton_dims, keepdim=False)
+    else:
+        assert running_mean is not None and running_var is not None
+        mean = running_mean
+        invstd = torch.rsqrt(running_var + eps)
+    assert invstd is not None and mean is not None
+    broadcast_mask = [1] * input_rank
+    broadcast_mask[axis] = input_shape[axis]
+    reduction_axes: List[int] = []
+    for i in range(input_rank):
+        if i != axis:
+            reduction_axes.append(i)
+    mean = torch.reshape(mean, broadcast_mask)
+    norm = 1.0 / num_features
+    grad_output_sum = torch.sum(grad_out, reduction_axes)
+    dot_p = torch.sum(grad_out * (input - mean), reduction_axes)
+    grad_mean = torch.reshape(grad_output_sum * norm, broadcast_mask)
+    proj_scale = torch.reshape(torch.mul(dot_p * norm, invstd * invstd), broadcast_mask)
+    if weight is None:
+        grad_scale = torch.reshape(invstd, broadcast_mask) * 1.0
+    else:
+        grad_scale = torch.reshape(invstd * weight, broadcast_mask)
+    if train:
+        proj = (input - mean) * proj_scale
+        grad_input = ((grad_out - proj) - grad_mean) * grad_scale
+    else:
+        grad_input = grad_out * grad_scale
+    if output_mask[1]:
+        grad_weight = dot_p * invstd
+    elif weight is not None:
+        grad_weight = torch.zeros_like(
+            weight
+        )  # should be None but doesn't work with vjp
+    else:
+        grad_weight = torch.zeros(())  # should be None but doesn't work with vjp
+    if output_mask[2]:
+        grad_bias = grad_output_sum
+    else:
+        grad_bias = torch.zeros_like(
+            grad_output_sum
+        )  # should be None but doesn't work with vjp
+    return (grad_input, grad_weight, grad_bias)
+_register_jit_decomposition_for_jvp(torch.ops.aten.trace.default, use_python=True)
+_register_jit_decomposition_for_jvp(torch.ops.aten.nll_loss_backward.default)
+_register_jit_decomposition_for_jvp(torch.ops.aten.nll_loss2d_backward.default)
+_register_jit_decomposition_for_jvp(torch.ops.aten._log_softmax_backward_data.default)
+_register_jit_decomposition_for_jvp(torch.ops.aten._softmax_backward_data.default)
+_register_jit_decomposition_for_jvp(torch.ops.aten.log_sigmoid_forward.default)
+_register_jit_decomposition_for_jvp(torch.ops.aten.native_layer_norm_backward.default)
+_register_jit_decomposition_for_jvp(torch.ops.aten.native_batch_norm_backward.default)
+_register_jit_decomposition_for_jvp(torch.ops.aten.cudnn_batch_norm_backward.default)

torch/_decomp/decompositions_for_rng.py ADDED Viewed

	@@ -0,0 +1,263 @@

+import functools
+from collections import defaultdict
+from typing import Callable, Dict
+import torch
+import torch._decomp as decomp
+from torch._decomp import get_decompositions
+from torch._ops import OpOverload
+aten = torch.ops.aten
+rng_decompositions: Dict[str, Dict[OpOverload, Callable]] = defaultdict(dict)
+def register_rng_decomposition(aten_op):
+    return decomp.register_decomposition(aten_op, rng_decompositions)
+def throw_on_non_cuda(device):
+    raise RuntimeError(
+        f"You are trying to functionalize a {device.type} RNG operator but {device.type} does not "
+        f"use Philox/counter-based RNG. Therefore, functionalizing a {device.type} RNG operator is "
+        "not supported. We are discussing the possibility of a Philox-based RNG implementation for CPU."
+    )
+# TODO - We have to register many more distributions here, and also higher level
+# ops like dropout which have fused implementation and can hide the rand inside.
+@register_rng_decomposition(aten.rand)
+def rand(shape, dtype=None, layout=torch.strided, device=None, pin_memory=False):
+    if device and device.type != "cuda":
+        throw_on_non_cuda(device)
+    seed, offset = PhiloxStateTracker.get_state_as_tuple()
+    dtype = dtype or torch.float32
+    out, offset_jump = torch.ops.rngprims.philox_rand(
+        shape, seed, offset, None, device, dtype
+    )
+    PhiloxStateTracker.advance_offset(offset_jump)
+    return out
+@register_rng_decomposition(aten.rand_like)
+def rand_like(
+    x: torch.Tensor,
+    dtype=None,
+    layout=None,
+    device=None,
+    pin_memory=False,
+    memory_format=torch.preserve_format,
+):
+    device = device or x.device
+    if device.type != "cuda":
+        throw_on_non_cuda(device)
+    dtype = dtype or x.dtype
+    seed, offset = PhiloxStateTracker.get_state_as_tuple()
+    out, offset_jump = torch.ops.rngprims.philox_rand(
+        x.shape, seed, offset, None, device, dtype
+    )
+    PhiloxStateTracker.advance_offset(offset_jump)
+    return out
+class PhiloxState:
+    """
+    Represents a PhiloxRngState - (seed, offset) where offset = base_offset +
+    relative_offset. seed and base_offset basically point to the rng state just
+    before tracing starts. relative offset tracks the totally consumed offset at
+    trace time.
+    """
+    def __init__(self):
+        self.reset()
+    def reset(self):
+        self.seed = torch.tensor(())
+        self.base_offset = torch.tensor(())
+        self.relative_offset = 0
+        self.offset_advanced_alteast_once = False
+    def validate_state(self):
+        assert self.seed.numel() != 0 and self.base_offset.numel() != 0
+    def advance_offset(self, consumed_offset):
+        self.offset_advanced_alteast_once = True
+        self.relative_offset = self.relative_offset + consumed_offset
+    def set_state(self, seed, base_offset, relative_offset=0):
+        self.seed = seed
+        self.base_offset = base_offset
+        self.relative_offset = relative_offset
+    def get_state_as_tuple(self):
+        self.validate_state()
+        return (self.seed, self.base_offset + self.relative_offset)
+    def get_state_as_tensor(self):
+        # Only needed because we override get_rng_state.
+        self.validate_state()
+        return torch.stack([self.seed, self.base_offset + self.relative_offset])
+    def set_state_from_tensor(self, state):
+        # Only needed because we override set_rng_state.
+        self.seed, self.base_offset = torch.unbind(state)
+        self.relative_offset = 0
+class PhiloxStateTracker:
+    """
+    Singleton class to track the philox rng state during AOT Autograd tracing.
+    For each aot tracing instance, AOT Autograd resets this tracker and keeps
+    track of both forward and backward offsets. At runtime, we only care about
+    the total consumed forward and backward offsets. For dynamic shapes, these
+    offsets are a function of input shapes. Therefore, the AOT generated graphs
+    have additional outputs that compute total consumed forward and backward
+    offsets.
+    """
+    running_state: PhiloxState
+    fwd_state: PhiloxState
+    bwd_state: PhiloxState
+    def __enter__(self):
+        PhiloxStateTracker.reset()
+        return self
+    def __exit__(self, exc_type, exc_cal, exc_tb):
+        PhiloxStateTracker.reset()
+    @classmethod
+    def reset(cls):
+        cls.running_state = PhiloxState()
+        cls.fwd_state = PhiloxState()
+        cls.bwd_state = PhiloxState()
+    @classmethod
+    def mark_beginning_of_forward(cls):
+        # Tells the tracker to use fwd_state as the running state
+        cls.running_state = cls.fwd_state
+    @classmethod
+    def mark_beginning_of_backward(cls):
+        # Tells the tracker to use bwd_state as the running state
+        cls.running_state = cls.bwd_state
+    @classmethod
+    def record_state(cls, seed, offset, mode):
+        # Records the seed and offset tensors. These tensors are used to invoke
+        # the philox_rand functional primitives.
+        if mode == "forward":
+            cls.fwd_state.set_state(seed, offset)
+            cls.mark_beginning_of_forward()
+        else:
+            assert mode == "backward"
+            cls.bwd_state.set_state(seed, offset)
+    @classmethod
+    def get_state_as_tensor(cls):
+        # The only reason this exists is because we override get_rng_state and
+        # set_rng_state during tracing. get_rng_state expects a tensor output,
+        # so return (seed, offset) tuple upset other parts of the program like
+        # ctx.saved_tensors.
+        # A bad consequence is that if user saves and restores rng state, we
+        # have little bit of ugliness in the generated code, where we first
+        # concat the (seed, offset) to create a tensor for get_rng_state, and
+        # then split it back to get (seed, offset) tuple in set_rng_state.
+        # TODO: Investigate if there is be a better way to wrap the tuple in a
+        # false Tensor object, and then desugar it later on.
+        return cls.running_state.get_state_as_tensor()
+    @classmethod
+    def get_state_as_tuple(cls):
+        return cls.running_state.get_state_as_tuple()
+    @classmethod
+    def set_state_from_tensor(cls, x):
+        # This is only needed because we override set_rng_state. Look at the
+        # comment in get_state_from_tensor method.
+        cls.running_state.set_state_from_tensor(x)
+    @classmethod
+    def advance_offset(cls, consumed_offset):
+        cls.running_state.advance_offset(consumed_offset)
+    @classmethod
+    def get_current_relative_offset(cls):
+        return cls.running_state.relative_offset
+    @staticmethod
+    def multiple_of_4(offset):
+        # torch cuda rng state offset must be a multiple of 4. For inductor, as
+        # we sum up all the numel, the result might not be a multiple of 4. This
+        # method achieves that.
+        return (offset + 3) // 4 * 4
+    @classmethod
+    def get_updated_fwd_offset(cls):
+        # Short circuit if no rand ops were observed
+        if not cls.fwd_state.offset_advanced_alteast_once:
+            return cls.fwd_state.base_offset
+        return cls.multiple_of_4(
+            cls.fwd_state.base_offset + cls.fwd_state.relative_offset
+        )
+    @classmethod
+    def get_updated_bwd_offset(cls):
+        # Short circuit if no rand ops were observed
+        if not cls.bwd_state.offset_advanced_alteast_once:
+            return cls.bwd_state.base_offset
+        return cls.multiple_of_4(
+            cls.bwd_state.base_offset + cls.bwd_state.relative_offset
+        )
+# Adding more decompositions which eventually use rand_like inside decomps.
+# Adding these in rng_decompositions ensures the functionalization of rand_like
+# ops used in these decomps. The list is copied from inductor codebase, which
+# uses it for similar purpose.
+#
+# Caution - These decomps do not have same accuracy as that of eager. However,
+# we can't just disable them with a config flag like fallback_random, because
+# for functionalization of rng ops, we have to decompose these ops.
+extra_random_decomps = get_decompositions(
+    [
+        aten.cauchy,
+        aten.cauchy_,
+        aten.exponential,
+        aten.exponential_,
+        aten.geometric,
+        aten.geometric_,
+        aten.native_dropout,
+        aten.normal,
+        aten.normal_,
+        aten.normal_functional,
+        aten.log_normal,
+        aten.log_normal_,
+        aten.rrelu_with_noise,
+        aten.rrelu_with_noise_,
+        aten.uniform_,
+    ]
+)
+register_extra_random_decomp = functools.partial(
+    decomp.register_decomposition, registry=extra_random_decomps
+)
+@register_extra_random_decomp([aten.bernoulli_])
+def bernoulli_(self, p=0.5):
+    if self.device == torch.device("cpu"):
+        return NotImplemented
+    return self.copy_(torch.rand_like(self, dtype=torch.float32) < p)
+@register_extra_random_decomp([aten.bernoulli.p])
+def bernoulli_p(self, p=0.5, *, generator=None):
+    if self.device == torch.device("cpu"):
+        return NotImplemented
+    assert generator is None
+    return torch.rand_like(self, dtype=torch.float32) < p
+rng_decompositions.update(extra_random_decomps)  # type: ignore[arg-type]

torch/_deploy.py ADDED Viewed

	@@ -0,0 +1,105 @@

+import io
+import torch
+from torch.package import Importer, OrderedImporter, PackageImporter, sys_importer
+from torch.package._package_pickler import create_pickler
+from torch.package._package_unpickler import PackageUnpickler
+from torch.serialization import _maybe_decode_ascii
+def _save_storages(importer, obj):
+    serialized_storages = []
+    serialized_dtypes = []
+    importer = importer if isinstance(importer, torch.package.PackageImporter) else None
+    importers: Importer
+    if importer is not None:
+        importers = OrderedImporter(importer, sys_importer)
+    else:
+        importers = sys_importer
+    def persistent_id(obj):
+        if torch.is_storage(obj) or isinstance(obj, torch.storage.TypedStorage):
+            if isinstance(obj, torch.storage.TypedStorage):
+                # TODO: Once we decide to break serialization FC, we can
+                # remove this case
+                storage = obj._untyped_storage
+                dtype = obj.dtype
+            else:
+                storage = obj
+                dtype = torch.uint8
+            serialized_storages.append(obj)
+            serialized_dtypes.append(dtype)
+            return ("storage", len(serialized_storages) - 1)
+        if hasattr(obj, "__reduce_deploy__"):
+            if _serialized_reduces.get(id(obj)) is None:
+                _serialized_reduces[id(obj)] = (
+                    "reduce_deploy",
+                    id(obj),
+                    *obj.__reduce_deploy__(importers),
+                )
+            return _serialized_reduces[id(obj)]
+        return None
+    # Write the pickle data for `obj`
+    data_buf = io.BytesIO()
+    pickler = create_pickler(data_buf, importers)
+    pickler.persistent_id = persistent_id
+    pickler.dump(obj)
+    data_value = data_buf.getvalue()
+    return (
+        data_value,
+        serialized_storages,
+        serialized_dtypes,
+        importer.zip_reader if importer else None,
+    )
+def _load_storages(id, zip_reader, obj_bytes, serialized_storages, serialized_dtypes):
+    def persistent_load(saved_id):
+        assert isinstance(saved_id, tuple)
+        typename = _maybe_decode_ascii(saved_id[0])
+        data = saved_id[1:]
+        if typename == "storage":
+            # TODO: Once we decide to break serialization FC, we can
+            # stop wrapping with TypedStorage
+            storage = serialized_storages[data[0]]
+            dtype = serialized_dtypes[data[0]]
+            return torch.storage.TypedStorage(
+                wrap_storage=storage.untyped(), dtype=dtype
+            )
+        if typename == "reduce_deploy":
+            reduce_id, func, args = data
+            if reduce_id not in _loaded_reduces:
+                _loaded_reduces[reduce_id] = func(_raw_packages[zip_reader], *args)
+            return _loaded_reduces[reduce_id]
+        return None
+    importer: Importer
+    if zip_reader is not None:
+        importer = OrderedImporter(_get_package(zip_reader), sys_importer)
+    else:
+        importer = sys_importer
+    unpickler = PackageUnpickler(importer, io.BytesIO(obj_bytes))
+    unpickler.persistent_load = persistent_load  # type: ignore[assignment]
+    result = _deploy_objects[id] = unpickler.load()
+    return result
+def _get_package(zip_reader):
+    if zip_reader not in _raw_packages:
+        _raw_packages[zip_reader] = PackageImporter(zip_reader)
+    return _raw_packages[zip_reader]
+_raw_packages: dict = {}
+_deploy_objects: dict = {}
+_serialized_reduces: dict = {}
+_loaded_reduces: dict = {}

torch/_dispatch/__init__.py ADDED Viewed

File without changes