import multiprocessing
import os
import threading
from multiprocessing.reduction import ForkingPickler
from multiprocessing.util import register_after_fork
from typing import Union

import torch
import torch.utils.hooks
from torch._namedtensor_internals import check_serializing_named_tensor

try:
    # Early load resource_sharer to prevent a partially initialized instance
    # from being inherited in a forked child process. The reduce_storage method
    # requires this module indirectly through DupFd(). The built-in mp.Queue
    # class pickles arguments in a background thread which may overlap with the
    # fork.
    import multiprocessing.resource_sharer
except ImportError:
    pass


class StorageWeakRef:
    r"""A weak reference to a Storage.

    The cdata member is a Python number containing the integer representation of
    the Storage pointer.
    """

    __slots__ = ["cdata", "_free_weak_ref"]

    def __init__(self, storage):
        self.cdata = storage._weak_ref()
        # Save a direct reference to _free_weak_ref because the `torch` module
        # might be cleared during Python shutdown before this module is cleared.
        self._free_weak_ref = torch.Storage._free_weak_ref  # type: ignore[attr-defined]

    @classmethod
    def from_weakref(cls, cdata):
        instance = cls.__new__(cls)
        instance.cdata = cdata
        instance._free_weak_ref = torch.Storage._free_weak_ref  # type: ignore[attr-defined]
        return instance

    def expired(self):
        return torch.Storage._expired(self.cdata)  # type: ignore[attr-defined]

    def __del__(self):
        self._free_weak_ref(self.cdata)

    def __hash__(self):
        return self.cdata

    def __eq__(self, other):
        if id(self) == id(other):
            return True
        return self.cdata == other.cdata
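

# Illustrative sketch only (not used anywhere in this module): it shows the
# weak-reference semantics that SharedCache below relies on -- a StorageWeakRef
# does not keep its storage alive, and expired() flips to True once the last
# strong reference to the underlying StorageImpl is gone.  The _example_* name
# is hypothetical and only exists for documentation purposes.
def _example_storage_weak_ref():
    storage = torch.empty(4)._typed_storage()
    ref = StorageWeakRef(storage)
    assert not ref.expired()
    # With CPython reference counting and no other references, dropping the
    # storage frees the StorageImpl, so the weak reference expires.
    del storage
    assert ref.expired()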


class SharedCache(dict):
    """Dictionary from multiprocessing handles to StorageWeakRef."""

    def __init__(self):
        # free_dead_references() is called if the len exceeds the current
        # limit. The limit scales with the number of remaining live objects.
        self.limit = 128
        # `fork` inherits lock state, so in case we fork when the lock is held,
        # we register a function to reset the lock to a new object to avoid
        # possible deadlocks, following the Python multiprocessing library design.
        self._after_fork()
        register_after_fork(self, SharedCache._after_fork)

    def _after_fork(self):
        self.lock = threading.Lock()

    def get(self, key):
        with self.lock:
            return dict.get(self, key)

    def __setitem__(self, key, storage_ref):
        with self.lock:
            dict.__setitem__(self, key, storage_ref)
            if len(self) > self.limit:
                self.free_dead_references()

    def free_dead_references(self):
        live = 0
        for key, storage_ref in list(self.items()):
            if storage_ref.expired():
                del self[key]
            else:
                live += 1
        self.limit = max(128, live * 2)


# mapping from handles to StorageWeakRef objects
shared_cache = SharedCache()


def rebuild_event(device, handle):
    return torch.cuda.Event.from_ipc_handle(device, handle)


def reduce_event(event):
    handle = event.ipc_handle()
    return (rebuild_event, (event.device, handle))


def rebuild_tensor(cls, storage, metadata):
    storage_offset, size, stride, requires_grad = metadata
    t = torch._utils._rebuild_tensor(storage, storage_offset, size, stride)
    if cls == torch.nn.parameter.Parameter:
        # we have to pass requires_grad into the constructor, rather than set it
        # as an attribute later, because it's an important check for integer
        # tensors, which must have requires_grad=False (or else they raise an error)
        t = torch.nn.parameter.Parameter(t, requires_grad=requires_grad)
    else:
        t.requires_grad = requires_grad
    return t


def rebuild_cuda_tensor(
    tensor_cls,
    tensor_size,
    tensor_stride,
    tensor_offset,
    storage_cls,
    dtype,
    storage_device,
    storage_handle,
    storage_size_bytes,
    storage_offset_bytes,
    requires_grad,
    ref_counter_handle,
    ref_counter_offset,
    event_handle,
    event_sync_required,
):
    # If storage_handle is None, storage points to nullptr.
    if storage_handle is None or storage_size_bytes == 0:
        storage = storage_cls(0, dtype=dtype, device=storage_device, _internal=True)
    else:
        storage = storage_from_cache(
            storage_cls, (storage_handle, storage_offset_bytes)
        )
        if storage is None:
            torch.cuda._lazy_init()
            storage = storage_cls._new_shared_cuda(
                storage_device,
                storage_handle,
                storage_size_bytes,
                storage_offset_bytes,
                ref_counter_handle,
                ref_counter_offset,
                event_handle,
                event_sync_required,
            )
            shared_cache[(storage_handle, storage_offset_bytes)] = StorageWeakRef(
                storage
            )
        else:
            # We are already ref-counting this Storage, but the producer needs
            # the new ref-counter to be released.
            storage_cls._release_ipc_counter(
                ref_counter_handle, ref_counter_offset, device=storage_device
            )

    _storage = (
        storage
        if isinstance(storage, torch.UntypedStorage)
        else storage._untyped_storage
    )

    t = torch._utils._rebuild_tensor(
        torch.storage.TypedStorage(wrap_storage=_storage, dtype=dtype, _internal=True),
        tensor_offset,
        tensor_size,
        tensor_stride,
    )

    if tensor_cls == torch.nn.parameter.Parameter:
        # It is crucial for integer tensors to receive
        # requires_grad=False as an argument in the constructor
        t = torch.nn.parameter.Parameter(t, requires_grad=requires_grad)
    else:
        t.requires_grad = requires_grad

    return t
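

# Illustrative sketch only (not used anywhere in this module): reduce_tensor()
# below returns a (rebuild_fn, args) pair, which is what ForkingPickler ships
# across the process boundary (see init_reductions() at the bottom of this
# file).  Calling the pair by hand in a single process shows the contract: the
# rebuilt tensor aliases the original storage rather than copying it.  The
# _example_* name is hypothetical.
def _example_reduce_rebuild_round_trip():
    t = torch.arange(6.0).reshape(2, 3)
    rebuild_fn, args = reduce_tensor(t)  # (rebuild_tensor, (cls, storage, metadata))
    t2 = rebuild_fn(*args)
    t2[0, 0] = 42.0  # in-place writes are visible through both tensors
    assert t[0, 0].item() == 42.0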


def reduce_tensor(tensor):
    if tensor.requires_grad and not tensor.is_leaf:
        raise RuntimeError(
            "Cowardly refusing to serialize non-leaf tensor which requires_grad, "
            "since autograd does not support crossing process boundaries. "
            "If you just want to transfer the data, call detach() on the tensor "
            "before serializing (e.g., putting it on the queue)."
        )

    check_serializing_named_tensor(tensor)
    torch.utils.hooks.warn_if_has_hooks(tensor)

    # Note [CUDA IPC and the caching allocator]
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # When you send a CUDA tensor over IPC, you might expect that you will
    # get out the same storage from the other end.  However, the CUDA caching
    # allocator makes it difficult to preserve this invariant.  Consider
    # the following situation: a tensor of size 0x40 points to offset 0x20 of
    # a storage at 0xA100 of size 0x100.  (For simplicity, all of these
    # sizes are given in bytes.)  HOWEVER, with the caching allocator, this storage
    # might be part of a larger cudaMalloc allocation 0xA000 of size 0x4000.
    #
    # When we want to send this CUDA tensor over IPC, we must send the
    # *entire* cudaMalloc allocation, i.e., the 0xA000 region, not just
    # the storage 0xA100 (because that is what CUDA supports).  So, on the
    # other end, there simply isn't any way to say, "Wait, you gave me
    # a bigger region (0xA000) than the one I wanted (0xA100)".
    #
    # OK, so if you sent the cudaMalloc allocation, can you just wrap that up as
    # one storage itself?  No, because this cudaMalloc allocation might contain
    # storages of mixed types: float, bytes, double...  If you make the entire
    # allocation a single storage of type A, we'll hit an error when constructing
    # a tensor of type B on that storage.
    #
    # cudaIpcMemHandle is an identifier to access the sender's cudaMalloc
    # allocation on the receiver side.  However, cudaIpcMemHandles from each
    # device in a given process may only be opened by one context per device
    # per other process.  If we open and close a memory handle multiple times
    # in a process, CUDA is allowed to give it a different address; similarly,
    # once we close the memory, we're not allowed to access it (or the
    # storage/tensor built on top of it), even if it is still live in the
    # original process.  Since we cannot turn the cudaMalloc allocation into a
    # single storage in one go, this requires us to cache the device pointer for
    # each cudaIpcMemHandle on the C++ side, so that storages of any type can be
    # reconstructed from it while the opened handle is kept alive.
    # See [https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html]
    #
    # This is fine, because all we need to do is to save our position in the
    # allocation, and reconstruct the storage and tensor from it.
    # 0xA000 ->  -------CUDA Allocation------
    #           |                            |
    #           |                            |
    #           |                            |
    #           |                            |
    # 0xA100 ->  --------storage1 begin------
    #           |                            |
    # 0xA120 ->  --------tensor1 begin ------
    #           |                            |
    #           |                            |
    #           |                            |
    #           |                            |
    #           |                            |
    # 0xA160 ->  --------tensor1 end---------
    #           |                            |
    #           |                            |
    #           |                            |
    # 0xA200 ->  --------storage1 end--------
    #           |                            |
    # 0xE000 ->  --------CUDA allocation-----
    #
    # To send tensor1, the following info is required from sender to receiver
    # for storage reconstruction:
    #   1. The cudaIpcMemHandle of the 0xA000 allocation (which can be mapped to
    #      a basePtr in the receiver process; basePtr will generally not be
    #      0xA000, since it is a different process).
    #   2. The offset of storage1 within the CUDA allocation
    #      (0x100 = 0xA100 - 0xA000).
    #   3. The size of storage1 (0x100).
    #
    # On the receiver side:
    #   1. Get the devPtr (basePtr) from the MemHandle to access the memory, and
    #      reconstruct a storage of the same type using (basePtr, offset, size).
    #   2. Reconstruct the tensor on top of the reconstructed storage:
    #      Tensor(size=0x040, offset=0x020,
    #             storage=Storage(data=basePtr+0x100, size=0x0100))
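    #
    # In terms of the arguments that reduce_tensor() packs up below and that
    # rebuild_cuda_tensor() consumes, the worked example above corresponds
    # roughly to (an illustrative mapping, not an exact trace):
    #   storage_handle       <- cudaIpcMemHandle of the 0xA000 allocation
    #   storage_offset_bytes <- 0x100  (storage1's offset within the allocation)
    #   storage_size_bytes   <- 0x100  (storage1's size)
    #   tensor_offset        <- 0x20 // itemsize  (storage_offset() counts
    #                                              elements, not bytes)
    #   tensor_size / tensor_stride <- the usual size() and stride() metadata
    #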
    # This strategy has a few implications:
    #
    # 1. When we serialize a CUDA tensor for IPC, we cannot do it all in one
    #    go (non-compositionally), and this requires us to have a global map
    #    memHandle -> devPtr for each process.
    #
    # 2. We MUST NOT let the new IPC tensor be resizable.  Originally, a resize
    #    of the storage beyond 0x100 would merely have caused us to do a
    #    reallocation.  You don't really want to do this, but if you did,
    #    all that would happen is that you would lose IPC sharing.  But if
    #    you do this in the new world, we will happily let you write out of
    #    bounds of your "allocation", clobbering unrelated data in the cached
    #    allocator block.  BAD!
    #
    # By the way, in old versions of PyTorch, we supported this situation
    # natively using a "storage view", which permitted multiple storages to be
    # views on each other.  But this was the *only* use of storage views, so we
    # eliminated it so that we could just use tensor views to implement the same
    # thing.
    #
    # TODO: Handle distinguishing between subclass and non-subclass versions of NT better
    # https://github.com/pytorch/pytorch/issues/110543
    from torch.nested._internal.nested_tensor import NestedTensor

    if tensor.is_nested and not isinstance(tensor, NestedTensor):
        return reduce_nested_tensor(tensor)

    if tensor.layout in {
        torch.sparse_coo,
        torch.sparse_csr,
        torch.sparse_bsr,
        torch.sparse_csc,
        torch.sparse_bsc,
    }:
        return reduce_sparse_tensor(tensor)

    storage = tensor._typed_storage()

    if storage._untyped_storage.device.type == "cuda":
        (
            device,
            handle,
            storage_size_bytes,
            storage_offset_bytes,
            ref_counter_handle,
            ref_counter_offset,
            event_handle,
            event_sync_required,
        ) = storage._share_cuda_()
        tensor_offset = tensor.storage_offset()
        shared_cache[handle] = StorageWeakRef(storage)
        # _backward_hooks purposely omitted here, see
        # Note [Don't serialize hooks]
        return (
            rebuild_cuda_tensor,
            (
                type(tensor),
                tensor.size(),
                tensor.stride(),
                tensor_offset,  # tensor offset in its storage
                type(storage),
                tensor.dtype,
                device,
                handle,  # identifier which CUDA allocation is the storage in.
                storage_size_bytes,  # size (in bytes) of the storage
                storage_offset_bytes,  # offset (in bytes) of the storage in the CUDA allocation
                tensor.requires_grad,
                ref_counter_handle,
                ref_counter_offset,
                event_handle,
                event_sync_required,
            ),
        )

    # _backward_hooks purposely omitted here, see Note [Don't serialize hooks]
    metadata = (
        tensor.storage_offset(),
        tensor.size(),
        tensor.stride(),
        tensor.requires_grad,
    )
    return (rebuild_tensor, (type(tensor), storage, metadata))


def rebuild_nested_tensor(
    rebuild_buffer_func,
    rebuild_buffer_args,
    rebuild_sizes_func,
    rebuild_sizes_args,
    rebuild_strides_func,
    rebuild_strides_args,
    rebuild_offsets_func,
    rebuild_offsets_args,
):
    buffer = rebuild_buffer_func(*rebuild_buffer_args)
    sizes = rebuild_sizes_func(*rebuild_sizes_args)
    strides = rebuild_strides_func(*rebuild_strides_args)
    offsets = rebuild_offsets_func(*rebuild_offsets_args)
    return torch._nested_view_from_buffer_copy(buffer, sizes, strides, offsets)


def reduce_nested_tensor(nt):
    rebuild_buffer_func, rebuild_buffer_args = reduce_tensor(nt.values())
    rebuild_sizes_func, rebuild_sizes_args = reduce_tensor(nt._nested_tensor_size())
    rebuild_strides_func, rebuild_strides_args = reduce_tensor(
        nt._nested_tensor_strides()
    )
    rebuild_offsets_func, rebuild_offsets_args = reduce_tensor(
        nt._nested_tensor_storage_offsets()
    )

    return (
        rebuild_nested_tensor,
        (
            rebuild_buffer_func,
            rebuild_buffer_args,
            rebuild_sizes_func,
            rebuild_sizes_args,
            rebuild_strides_func,
            rebuild_strides_args,
            rebuild_offsets_func,
            rebuild_offsets_args,
        ),
    )


def rebuild_sparse_coo_tensor(
    rebuild_indices_func,
    rebuild_indices_args,
    rebuild_values_func,
    rebuild_values_args,
    shape,
    is_coalesced,
):
    indices = rebuild_indices_func(*rebuild_indices_args)
    values = rebuild_values_func(*rebuild_values_args)
    return torch.sparse_coo_tensor(indices, values, shape, is_coalesced=is_coalesced)
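

# Illustrative sketch only (not used anywhere in this module): a same-process
# round trip through reduce_sparse_tensor()/rebuild_sparse_coo_tensor() for a
# COO tensor.  Over IPC the indices and values tensors would each be rebuilt in
# the receiving process first.  The _example_* name is hypothetical.
def _example_sparse_coo_round_trip():
    i = torch.tensor([[0, 1], [1, 0]])
    v = torch.tensor([3.0, 4.0])
    s = torch.sparse_coo_tensor(i, v, (2, 2)).coalesce()
    rebuild_fn, args = reduce_sparse_tensor(s)
    s2 = rebuild_fn(*args)
    assert torch.equal(s2.to_dense(), s.to_dense())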


def rebuild_sparse_compressed_tensor(
    rebuild_compressed_indices_func,
    rebuild_compressed_indices_args,
    rebuild_plain_indices_func,
    rebuild_plain_indices_args,
    rebuild_values_func,
    rebuild_values_args,
    shape,
    layout,
):
    compressed_indices = rebuild_compressed_indices_func(
        *rebuild_compressed_indices_args
    )
    plain_indices = rebuild_plain_indices_func(*rebuild_plain_indices_args)
    values = rebuild_values_func(*rebuild_values_args)
    return torch.sparse_compressed_tensor(
        compressed_indices, plain_indices, values, shape, layout=layout
    )


def reduce_sparse_tensor(sparse):
    if sparse.layout is torch.sparse_coo:
        rebuild_indices_func, rebuild_indices_args = reduce_tensor(sparse._indices())
        rebuild_values_func, rebuild_values_args = reduce_tensor(sparse._values())
        return (
            rebuild_sparse_coo_tensor,
            (
                rebuild_indices_func,
                rebuild_indices_args,
                rebuild_values_func,
                rebuild_values_args,
                sparse.shape,
                sparse.is_coalesced(),
            ),
        )
    else:
        if sparse.layout in {torch.sparse_csr, torch.sparse_bsr}:
            compressed_indices = sparse.crow_indices()
            plain_indices = sparse.col_indices()
        elif sparse.layout in {torch.sparse_csc, torch.sparse_bsc}:
            compressed_indices = sparse.ccol_indices()
            plain_indices = sparse.row_indices()
        else:
            raise NotImplementedError(sparse.layout)
        (
            rebuild_compressed_indices_func,
            rebuild_compressed_indices_args,
        ) = reduce_tensor(compressed_indices)
        rebuild_plain_indices_func, rebuild_plain_indices_args = reduce_tensor(
            plain_indices
        )
        rebuild_values_func, rebuild_values_args = reduce_tensor(sparse.values())
        return (
            rebuild_sparse_compressed_tensor,
            (
                rebuild_compressed_indices_func,
                rebuild_compressed_indices_args,
                rebuild_plain_indices_func,
                rebuild_plain_indices_args,
                rebuild_values_func,
                rebuild_values_args,
                sparse.shape,
                sparse.layout,
            ),
        )
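

# Illustrative sketch only (not used anywhere in this module): the compressed
# layouts go through the second branch of reduce_sparse_tensor() above --
# CSR/BSR ship (crow_indices, col_indices) while CSC/BSC ship
# (ccol_indices, row_indices).  The _example_* name is hypothetical.
def _example_sparse_csr_round_trip():
    s = torch.tensor([[1.0, 0.0], [0.0, 2.0]]).to_sparse_csr()
    rebuild_fn, args = reduce_sparse_tensor(s)
    s2 = rebuild_fn(*args)
    assert s2.layout is torch.sparse_csr
    assert torch.equal(s2.to_dense(), s.to_dense())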


def fd_id(fd):
    # Returns a tuple which uniquely identifies a file descriptor. On macOS,
    # this doesn't work with shared memory handles, which is why we don't
    # support the "file_descriptor" sharing method on that platform.
    stat = os.fstat(fd)
    return (stat.st_ino, stat.st_dev)


def storage_from_cache(cls, key):
    storage_ref = shared_cache.get(key)
    if storage_ref is None:
        return None
    return torch.UntypedStorage._new_with_weak_ptr(storage_ref.cdata)


def rebuild_storage_fd(cls, df, size):
    fd = df.detach()
    try:
        storage = storage_from_cache(cls, fd_id(fd))
        if storage is not None:
            return storage
        storage = cls._new_shared_fd_cpu(fd, size)
        shared_cache[fd_id(fd)] = StorageWeakRef(storage)
        return storage
    finally:
        os.close(fd)


def rebuild_storage_filename(cls, manager, handle, size, dtype=None):
    storage: Union[torch.TypedStorage, torch.UntypedStorage] = storage_from_cache(
        cls, handle
    )
    if storage is not None:
        return storage._shared_decref()
    if dtype is None:
        storage = torch.UntypedStorage._new_shared_filename_cpu(manager, handle, size)
    else:
        byte_size = size * torch._utils._element_size(dtype)
        untyped_storage: torch.UntypedStorage = (
            torch.UntypedStorage._new_shared_filename_cpu(manager, handle, byte_size)
        )
        storage = torch.TypedStorage(
            wrap_storage=untyped_storage, dtype=dtype, _internal=True
        )
    shared_cache[handle] = StorageWeakRef(storage)
    return storage._shared_decref()


def rebuild_storage_empty(cls):
    return cls()


def rebuild_typed_storage(storage, dtype):
    return torch.storage.TypedStorage(wrap_storage=storage, dtype=dtype, _internal=True)


# Use for torch.storage.TypedStorage
def reduce_typed_storage(storage):
    return (rebuild_typed_storage, (storage._untyped_storage, storage.dtype))


def rebuild_typed_storage_child(storage, storage_type):
    return storage_type(wrap_storage=storage, _internal=True)


# Use for child classes of torch.storage.TypedStorage, like torch.FloatStorage
def reduce_typed_storage_child(storage):
    return (rebuild_typed_storage_child, (storage._untyped_storage, type(storage)))


def reduce_storage(storage):
    from . import get_sharing_strategy

    if storage.is_cuda:
        raise RuntimeError(
            "Cannot pickle CUDA storage; try pickling a CUDA tensor instead"
        )
    elif get_sharing_strategy() == "file_system":
        metadata = storage._share_filename_cpu_()
        cache_key = metadata[1]
        rebuild = rebuild_storage_filename
        if isinstance(storage, torch.TypedStorage):
            metadata += (storage.dtype,)
        storage._shared_incref()
    elif storage.size() == 0:
        # This is special cased because empty storages
        # (with size 0) cannot be mmapped.
        return (rebuild_storage_empty, (type(storage),))
    else:
        fd, size = storage._share_fd_cpu_()
        df = multiprocessing.reduction.DupFd(fd)
        cache_key = fd_id(fd)
        metadata = (df, size)
        rebuild = rebuild_storage_fd  # type: ignore[assignment]

    shared_cache[cache_key] = StorageWeakRef(storage)
    return (rebuild, (type(storage),) + metadata)


def init_reductions():
    ForkingPickler.register(torch.cuda.Event, reduce_event)

    for t in torch._storage_classes:
        if t.__name__ == "UntypedStorage":
            ForkingPickler.register(t, reduce_storage)
        else:
            ForkingPickler.register(t, reduce_typed_storage_child)
    ForkingPickler.register(torch.storage.TypedStorage, reduce_typed_storage)

    for t in torch._tensor_classes:
        ForkingPickler.register(t, reduce_tensor)

    # TODO: Maybe this should be in tensor_classes? :)
    ForkingPickler.register(torch.Tensor, reduce_tensor)
    ForkingPickler.register(torch.nn.parameter.Parameter, reduce_tensor)
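

# Illustrative sketch only (not used anywhere in this module): once
# init_reductions() has run (torch.multiprocessing calls it on import), tensors
# sent through a multiprocessing Queue travel as shared-memory handles rather
# than byte copies, so writes made by the consumer are visible to the producer.
# This sketch assumes a Linux "fork" start method so the nested worker need not
# be pickled; CUDA tensors additionally require a "spawn"/"forkserver" start
# method and go through the CUDA IPC path described in
# Note [CUDA IPC and the caching allocator].  The _example_* name is
# hypothetical.
def _example_share_cpu_tensor_across_processes():
    import torch.multiprocessing as mp

    ctx = mp.get_context("fork")
    q = ctx.Queue()
    t = torch.zeros(3)

    def worker(queue):
        shared = queue.get()  # rebuilt via rebuild_storage_fd/rebuild_tensor
        shared.fill_(1.0)  # lands in the shared segment, not a private copy

    p = ctx.Process(target=worker, args=(q,))
    p.start()
    q.put(t)  # reduce_tensor/reduce_storage move t's data into shared memory
    p.join()
    assert torch.equal(t, torch.ones(3))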