Add files using upload-large-folder tool

d1d4335 verified 3 months ago

9.9 kB

	#pragma once

	#include <cuda_runtime_api.h>

	#include <c10/core/DeviceGuard.h>
	#include <c10/core/Stream.h>
	#include <c10/cuda/CUDAFunctions.h>
	#include <c10/util/Exception.h>

	/*
	* Stream pool note.
	*
	* A CUDAStream is an abstraction of an actual cuStream on the GPU. CUDAStreams
	* are backed by cuStreams, but they use several pools to minimize the costs
	* associated with creating, retaining, and destroying cuStreams.
	*
	* There are three pools per device, and a device's pools are lazily created.
	*
	* The first pool contains only the default stream. When the default stream
	* is requested it's returned.
	*
	* The second pool is the "low priority" or "default priority" streams. In
	* HIP builds there is no distinction between streams in this pool and streams
	* in the third pool (below). There are 32 of these streams per device, and
	* when a stream is requested one of these streams is returned round-robin.
	* That is, the first stream requested is at index 0, the second at index 1...
	* to index 31, then index 0 again.
	*
	* This means that if 33 low priority streams are requested, the first and
	* last streams requested are actually the same stream (under the covers)
	* and kernels enqueued on them cannot run concurrently.
	*
	* The third pool is the "high priority" streams. The third pool acts like
	* the second pool except the streams are created with a higher priority.
	*
	* These pools suggest that stream users should prefer many short-lived streams,
	* as the cost of acquiring and releasing streams is effectively zero. If
	* many longer-lived streams are required in performance critical scenarios
	* then the functionality here may need to be extended to allow, for example,
	* "reserving" a subset of the pool so that other streams do not accidentally
	* overlap the performance critical streams.
	*
	* Note: although the notion of "current stream for device" is thread local
	* (every OS thread has a separate current stream, as one might expect),
	* the stream pool is global across all threads; stream 0 is always stream 0
	* no matter which thread you use it on. Multiple threads can synchronize
	* on the same stream. Although the CUDA documentation is not very clear
	* on the matter, streams are thread safe; e.g., it is safe to enqueue
	* a kernel on the same stream from two different threads.
	*/

	namespace c10::cuda {

	static constexpr int max_compile_time_stream_priorities = 4;

	// Value object representing a CUDA stream. This is just a wrapper
	// around c10::Stream, but it comes with a little extra CUDA-specific
	// functionality (conversion to cudaStream_t), and a guarantee that
	// the wrapped c10::Stream really is a CUDA stream.
	class C10_CUDA_API CUDAStream {
	public:
	enum Unchecked { UNCHECKED };

	/// Construct a CUDAStream from a Stream. This construction is checked,
	/// and will raise an error if the Stream is not, in fact, a CUDA stream.
	explicit CUDAStream(Stream stream) : stream_(stream) {
	TORCH_CHECK(stream_.device_type() == DeviceType::CUDA);
	}

	/// Construct a CUDAStream from a Stream with no error checking.
	/// This constructor uses the "named" constructor idiom, and can
	/// be invoked as: CUDAStream(CUDAStream::UNCHECKED, stream)
	explicit CUDAStream(Unchecked, Stream stream) : stream_(stream) {}

	bool operator==(const CUDAStream& other) const noexcept {
	return unwrap() == other.unwrap();
	}

	bool operator!=(const CUDAStream& other) const noexcept {
	return unwrap() != other.unwrap();
	}

	/// Implicit conversion to cudaStream_t.
	operator cudaStream_t() const {
	return stream();
	}

	/// Implicit conversion to Stream (a.k.a., forget that the stream is a
	/// CUDA stream).
	operator Stream() const {
	return unwrap();
	}

	/// Used to avoid baking in device type explicitly to Python-side API.
	DeviceType device_type() const {
	return DeviceType::CUDA;
	}

	/// Get the CUDA device index that this stream is associated with.
	DeviceIndex device_index() const {
	return stream_.device_index();
	}

	/// Get the full Device that this stream is associated with. The Device
	/// is guaranteed to be a CUDA device.
	Device device() const {
	return Device(DeviceType::CUDA, device_index());
	}

	/// Return the stream ID corresponding to this particular stream.
	StreamId id() const {
	return stream_.id();
	}

	bool query() const {
	DeviceGuard guard{stream_.device()};
	cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaStreamQuery(stream()));

	if (err == cudaSuccess) {
	return true;
	} else if (err != cudaErrorNotReady) {
	C10_CUDA_CHECK(err);
	} else {
	// ignore and clear the error if not ready
	(void)cudaGetLastError();
	}

	return false;
	}

	void synchronize() const {
	DeviceGuard guard{stream_.device()};
	c10::cuda::stream_synchronize(stream());
	}

	int priority() const {
	DeviceGuard guard{stream_.device()};
	int priority = 0;
	C10_CUDA_CHECK(cudaStreamGetPriority(stream(), &priority));
	return priority;
	}

	/// Explicit conversion to cudaStream_t.
	cudaStream_t stream() const;

	/// Explicit conversion to Stream.
	Stream unwrap() const {
	return stream_;
	}

	/// Reversibly pack a CUDAStream into a struct representation.
	/// Previously the stream's data was packed into a single int64_t,
	/// as it was assumed the fields would not require more than
	/// 64 bits of storage in total.
	/// See https://github.com/pytorch/pytorch/issues/75854
	/// for more information regarding newer platforms that may violate
	/// this assumption.
	///
	/// The CUDAStream can be unpacked using unpack().
	struct c10::StreamData3 pack3() const {
	return stream_.pack3();
	}

	// Unpack a CUDAStream from the 3 fields generated by pack().
	static CUDAStream unpack3(
	StreamId stream_id,
	DeviceIndex device_index,
	DeviceType device_type) {
	return CUDAStream(Stream::unpack3(stream_id, device_index, device_type));
	}

	static std::tuple<int, int> priority_range() {
	// Note: this returns the range of priority supported by PyTorch, not
	// the range of priority supported by CUDA. The former is a subset of
	// the latter.
	int least_priority = 0, greatest_priority = 0;
	C10_CUDA_CHECK(
	cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority));
	#ifdef USE_ROCM
	// See Note [HIP stream priorities]
	TORCH_INTERNAL_ASSERT(
	least_priority == 1, "Unexpected HIP stream priority range");
	least_priority = 0;
	#else
	TORCH_INTERNAL_ASSERT(
	least_priority == 0, "Unexpected CUDA stream priority range");
	#endif
	TORCH_INTERNAL_ASSERT(
	greatest_priority <= -1, "Unexpected CUDA stream priority range");
	greatest_priority = std::max(
	-c10::cuda::max_compile_time_stream_priorities + 1, greatest_priority);
	return std::make_tuple(least_priority, greatest_priority);
	}

	// Deleted for now; use CUDAEvent::block instead
	// void synchronize_with(const CUDAEvent& event) const;

	private:
	Stream stream_;
	};

	/**
	* Get a new stream from the CUDA stream pool. You can think of this
	* as "creating" a new stream, but no such creation actually happens;
	* instead, streams are preallocated from the pool and returned in a
	* round-robin fashion.
	*
	* You can request a stream from the high priority pool by setting
	* isHighPriority to true, or a stream for a specific device by setting device
	* (defaulting to the current CUDA stream.)
	*/
	C10_API CUDAStream
	getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1);
	// no default priority to disambiguate overloads
	C10_API CUDAStream
	getStreamFromPool(const int priority, DeviceIndex device = -1);

	/**
	* Get a CUDAStream from a externally allocated one.
	*
	* This is mainly for interoperability with different libraries where we
	* want to operate on a non-torch allocated stream for data exchange or similar
	* purposes
	*/
	C10_API CUDAStream
	getStreamFromExternal(cudaStream_t ext_stream, DeviceIndex device_index);

	/**
	* Get the default CUDA stream, for the passed CUDA device, or for the
	* current device if no device index is passed. The default stream is
	* where most computation occurs when you aren't explicitly using
	* streams.
	*/
	C10_API CUDAStream getDefaultCUDAStream(DeviceIndex device_index = -1);

	/**
	* Get the current CUDA stream, for the passed CUDA device, or for the
	* current device if no device index is passed. The current CUDA stream
	* will usually be the default CUDA stream for the device, but it may
	* be different if someone called 'setCurrentCUDAStream' or used 'StreamGuard'
	* or 'CUDAStreamGuard'.
	*/
	C10_API CUDAStream getCurrentCUDAStream(DeviceIndex device_index = -1);

	/**
	* Set the current stream on the device of the passed in stream to be
	* the passed in stream. Yes, you read that right: this function
	* has nothing to do with the current device: it toggles the current
	* stream of the device of the passed stream.
	*
	* Confused? Avoid using this function; prefer using 'CUDAStreamGuard' instead
	* (which will switch both your current device and current stream in the way you
	* expect, and reset it back to its original state afterwards).
	*/
	C10_API void setCurrentCUDAStream(CUDAStream stream);

	C10_API std::ostream& operator<<(std::ostream& stream, const CUDAStream& s);

	} // namespace c10::cuda

	namespace std {
	template <>
	struct hash<c10::cuda::CUDAStream> {
	size_t operator()(c10::cuda::CUDAStream s) const noexcept {
	return std::hash<c10::Stream>{}(s.unwrap());
	}
	};
	} // namespace std