PyTorch
Loading...
Searching...
No Matches
CUDAStream.h
Go to the documentation of this file.
1#pragma once
2
#include <cstdint>
#include <functional>
#include <ostream>
#include <tuple>
#include <utility>

#include <cuda_runtime_api.h>

#include <c10/core/DeviceGuard.h>
#include <c10/core/Stream.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/util/Exception.h>
12
13/*
14 * Stream pool note.
15 *
16 * A CUDAStream is an abstraction of an actual cuStream on the GPU. CUDAStreams
17 * are backed by cuStreams, but they use several pools to minimize the costs
18 * associated with creating, retaining, and destroying cuStreams.
19 *
20 * There are three pools per device, and a device's pools are lazily created.
21 *
22 * The first pool contains only the default stream. When the default stream
23 * is requested it's returned.
24 *
25 * The second pool is the "low priority" or "default priority" streams. In
26 * HIP builds there is no distinction between streams in this pool and streams
27 * in the third pool (below). There are 32 of these streams per device, and
28 * when a stream is requested one of these streams is returned round-robin.
29 * That is, the first stream requested is at index 0, the second at index 1...
30 * to index 31, then index 0 again.
31 *
32 * This means that if 33 low priority streams are requested, the first and
33 * last streams requested are actually the same stream (under the covers)
34 * and kernels enqueued on them cannot run concurrently.
35 *
36 * The third pool is the "high priority" streams. The third pool acts like
37 * the second pool except the streams are created with a higher priority.
38 *
39 * These pools suggest that stream users should prefer many short-lived streams,
40 * as the cost of acquiring and releasing streams is effectively zero. If
41 * many longer-lived streams are required in performance critical scenarios
42 * then the functionality here may need to be extended to allow, for example,
43 * "reserving" a subset of the pool so that other streams do not accidentally
44 * overlap the performance critical streams.
45 *
46 * Note: although the notion of "current stream for device" is thread local
47 * (every OS thread has a separate current stream, as one might expect),
48 * the stream pool is global across all threads; stream 0 is always stream 0
49 * no matter which thread you use it on. Multiple threads can synchronize
50 * on the same stream. Although the CUDA documentation is not very clear
51 * on the matter, streams are thread safe; e.g., it is safe to enqueue
52 * a kernel on the same stream from two different threads.
53 */
54
55namespace c10 {
56namespace cuda {
57
58// Value object representing a CUDA stream. This is just a wrapper
59// around c10::Stream, but it comes with a little extra CUDA-specific
60// functionality (conversion to cudaStream_t), and a guarantee that
61// the wrapped c10::Stream really is a CUDA stream.
62class C10_CUDA_API CUDAStream {
63 public:
64 enum Unchecked { UNCHECKED };
65
68 explicit CUDAStream(Stream stream) : stream_(stream) {
69 TORCH_CHECK(stream_.device_type() == DeviceType::CUDA);
70 }
71
75 explicit CUDAStream(Unchecked, Stream stream) : stream_(stream) {}
76
77 bool operator==(const CUDAStream& other) const noexcept {
78 return unwrap() == other.unwrap();
79 }
80
81 bool operator!=(const CUDAStream& other) const noexcept {
82 return unwrap() != other.unwrap();
83 }
84
86 operator cudaStream_t() const {
87 return stream();
88 }
89
92 operator Stream() const {
93 return unwrap();
94 }
95
98 return DeviceType::CUDA;
99 }
100
103 return stream_.device_index();
104 }
105
108 Device device() const {
109 return Device(DeviceType::CUDA, device_index());
110 }
111
113 StreamId id() const {
114 return stream_.id();
115 }
116
117 bool query() const {
118 DeviceGuard guard{stream_.device()};
119 cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaStreamQuery(stream()));
120
121 if (err == cudaSuccess) {
122 return true;
123 } else if (err != cudaErrorNotReady) {
124 C10_CUDA_CHECK(err);
125 } else {
126 // ignore and clear the error if not ready
127 (void)cudaGetLastError();
128 }
129
130 return false;
131 }
132
133 void synchronize() const {
134 DeviceGuard guard{stream_.device()};
135 c10::cuda::stream_synchronize(stream());
136 }
137
138 int priority() const {
139 DeviceGuard guard{stream_.device()};
140 int priority = 0;
141 C10_CUDA_CHECK(cudaStreamGetPriority(stream(), &priority));
142 return priority;
143 }
144
146 cudaStream_t stream() const;
147
149 Stream unwrap() const {
150 return stream_;
151 }
152
162 struct c10::StreamData3 pack3() const {
163 return stream_.pack3();
164 }
165
166 // Unpack a CUDAStream from the 3 fields generated by pack().
168 StreamId stream_id,
169 DeviceIndex device_index,
170 DeviceType device_type) {
171 return CUDAStream(Stream::unpack3(stream_id, device_index, device_type));
172 }
173
174 static std::tuple<int, int> priority_range() {
175 // Note: this returns the range of priority **supported by PyTorch**, not
176 // the range of priority **supported by CUDA**. The former is a subset of
177 // the latter. Currently PyTorch only supports 0 and -1, which are "low" and
178 // "high" priority.
179 int least_priority, greatest_priority;
180 C10_CUDA_CHECK(
181 cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority));
183 least_priority >= 0, "Unexpected CUDA stream priority range");
185 greatest_priority <= -1, "Unexpected CUDA stream priority range");
186 return std::make_tuple(0, -1);
187 }
188
189 // Deleted for now; use CUDAEvent::block instead
190 // void synchronize_with(const CUDAEvent& event) const;
191
192 private:
193 Stream stream_;
194};
195
/// Get a new stream from the CUDA stream pool for the given device (-1 for
/// the default device — presumably the current device; confirm against the
/// implementation).  Pool streams are handed out round-robin; see the
/// "Stream pool note" at the top of this file.  Pass isHighPriority = true
/// to draw from the high-priority pool instead.
C10_API CUDAStream
getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1);

/// Get a CUDAStream wrapping an externally allocated cudaStream_t.
/// NOTE(review): presumably the caller retains ownership/lifetime of
/// ext_stream — confirm against the implementation.
C10_API CUDAStream
getStreamFromExternal(cudaStream_t ext_stream, DeviceIndex device_index);

/// Get the default CUDA stream for the passed CUDA device, or for the
/// current device if no device index is passed.
C10_API CUDAStream getDefaultCUDAStream(DeviceIndex device_index = -1);

/// Get the current CUDA stream for the passed CUDA device, or for the
/// current device if no device index is passed.  The current stream is
/// thread-local (see the note at the top of this file).
C10_API CUDAStream getCurrentCUDAStream(DeviceIndex device_index = -1);

/// Set the current stream, on the device of the passed-in stream, to be
/// the passed-in stream.
C10_API void setCurrentCUDAStream(CUDAStream stream);

/// Print a representation of the stream to the given output stream.
C10_API std::ostream& operator<<(std::ostream& stream, const CUDAStream& s);
249
250} // namespace cuda
251} // namespace c10
252
253namespace std {
254template <>
255struct hash<c10::cuda::CUDAStream> {
256 size_t operator()(c10::cuda::CUDAStream s) const noexcept {
257 return std::hash<c10::Stream>{}(s.unwrap());
258 }
259};
260} // namespace std
#define TORCH_INTERNAL_ASSERT(cond,...)
Definition: Exception.h:377
#define TORCH_CHECK(cond,...)
Definition: Exception.h:505
Definition: CUDAStream.h:62
Stream unwrap() const
Explicit conversion to Stream.
Definition: CUDAStream.h:149
cudaStream_t stream() const
Explicit conversion to cudaStream_t.
DeviceType device_type() const
Used to avoid baking in device type explicitly to Python-side API.
Definition: CUDAStream.h:97
bool query() const
Definition: CUDAStream.h:117
StreamId id() const
Return the stream ID corresponding to this particular stream.
Definition: CUDAStream.h:113
int priority() const
Definition: CUDAStream.h:138
CUDAStream(Unchecked, Stream stream)
Construct a CUDAStream from a Stream with no error checking.
Definition: CUDAStream.h:75
Device device() const
Get the full Device that this stream is associated with.
Definition: CUDAStream.h:108
Unchecked
Definition: CUDAStream.h:64
static std::tuple< int, int > priority_range()
Definition: CUDAStream.h:174
static CUDAStream unpack3(StreamId stream_id, DeviceIndex device_index, DeviceType device_type)
Definition: CUDAStream.h:167
void synchronize() const
Definition: CUDAStream.h:133
DeviceIndex device_index() const
Get the CUDA device index that this stream is associated with.
Definition: CUDAStream.h:102
CUDAStream(Stream stream)
Construct a CUDAStream from a Stream.
Definition: CUDAStream.h:68
bool operator!=(const CUDAStream &other) const noexcept
Definition: CUDAStream.h:81
bool operator==(const CUDAStream &other) const noexcept
Definition: CUDAStream.h:77
CUDAStream getDefaultCUDAStream(DeviceIndex device_index=-1)
Get the default CUDA stream, for the passed CUDA device, or for the current device if no device index is passed.
CUDAStream getStreamFromExternal(cudaStream_t ext_stream, DeviceIndex device_index)
Get a CUDAStream from a externally allocated one.
CUDAStream getStreamFromPool(const bool isHighPriority=false, DeviceIndex device=-1)
Get a new stream from the CUDA stream pool.
void setCurrentCUDAStream(CUDAStream stream)
Set the current stream on the device of the passed in stream to be the passed in stream.
std::ostream & operator<<(std::ostream &stream, const CUDAStream &s)
CUDAStream getCurrentCUDAStream(DeviceIndex device_index=-1)
Get the current CUDA stream, for the passed CUDA device, or for the current device if no device index is passed.
Definition: ivalue.h:27
int8_t DeviceIndex
An index representing a specific device; e.g., the 1 in GPU 1.
Definition: Device.h:18
DeviceType
Definition: DeviceType.h:33
Definition: Device.h:181
Represents a compute device on which a tensor is located.
Definition: Device.h:30