PyTorch
Loading...
Searching...
No Matches
CUDAStream.h
Go to the documentation of this file.
1#pragma once
2
#include <cstdint>
#include <functional>
#include <ostream>
#include <tuple>
#include <utility>

#include <cuda_runtime_api.h>

#include <c10/core/DeviceGuard.h>
#include <c10/core/Stream.h>
#include <c10/cuda/CUDAFunctions.h>
#include <c10/util/Exception.h>
12
13/*
14 * Stream pool note.
15 *
16 * A CUDAStream is an abstraction of an actual cuStream on the GPU. CUDAStreams
17 * are backed by cuStreams, but they use several pools to minimize the costs
18 * associated with creating, retaining, and destroying cuStreams.
19 *
20 * There are three pools per device, and a device's pools are lazily created.
21 *
22 * The first pool contains only the default stream. When the default stream
23 * is requested it's returned.
24 *
25 * The second pool is the "low priority" or "default priority" streams. In
26 * HIP builds there is no distinction between streams in this pool and streams
27 * in the third pool (below). There are 32 of these streams per device, and
28 * when a stream is requested one of these streams is returned round-robin.
29 * That is, the first stream requested is at index 0, the second at index 1...
30 * to index 31, then index 0 again.
31 *
32 * This means that if 33 low priority streams are requested, the first and
33 * last streams requested are actually the same stream (under the covers)
34 * and kernels enqueued on them cannot run concurrently.
35 *
36 * The third pool is the "high priority" streams. The third pool acts like
37 * the second pool except the streams are created with a higher priority.
38 *
39 * These pools suggest that stream users should prefer many short-lived streams,
40 * as the cost of acquiring and releasing streams is effectively zero. If
41 * many longer-lived streams are required in performance critical scenarios
42 * then the functionality here may need to be extended to allow, for example,
43 * "reserving" a subset of the pool so that other streams do not accidentally
44 * overlap the performance critical streams.
45 *
46 * Note: although the notion of "current stream for device" is thread local
47 * (every OS thread has a separate current stream, as one might expect),
48 * the stream pool is global across all threads; stream 0 is always stream 0
49 * no matter which thread you use it on. Multiple threads can synchronize
50 * on the same stream. Although the CUDA documentation is not very clear
51 * on the matter, streams are thread safe; e.g., it is safe to enqueue
52 * a kernel on the same stream from two different threads.
53 */
54
55namespace c10 {
56namespace cuda {
57
58// Value object representing a CUDA stream. This is just a wrapper
59// around c10::Stream, but it comes with a little extra CUDA-specific
60// functionality (conversion to cudaStream_t), and a guarantee that
61// the wrapped c10::Stream really is a CUDA stream.
62class C10_CUDA_API CUDAStream {
63 public:
64 enum Unchecked { UNCHECKED };
65
68 explicit CUDAStream(Stream stream) : stream_(stream) {
69 TORCH_CHECK(stream_.device_type() == DeviceType::CUDA);
70 }
71
75 explicit CUDAStream(Unchecked, Stream stream) : stream_(stream) {}
76
77 bool operator==(const CUDAStream& other) const noexcept {
78 return unwrap() == other.unwrap();
79 }
80
81 bool operator!=(const CUDAStream& other) const noexcept {
82 return unwrap() != other.unwrap();
83 }
84
86 operator cudaStream_t() const {
87 return stream();
88 }
89
92 operator Stream() const {
93 return unwrap();
94 }
95
98 return DeviceType::CUDA;
99 }
100
103 return stream_.device_index();
104 }
105
108 Device device() const {
109 return Device(DeviceType::CUDA, device_index());
110 }
111
113 StreamId id() const {
114 return stream_.id();
115 }
116
117 bool query() const {
118 DeviceGuard guard{stream_.device()};
119 cudaError_t err = C10_CUDA_ERROR_HANDLED(cudaStreamQuery(stream()));
120
121 if (err == cudaSuccess) {
122 return true;
123 } else if (err != cudaErrorNotReady) {
124 C10_CUDA_CHECK(err);
125 } else {
126 // ignore and clear the error if not ready
127 (void)cudaGetLastError();
128 }
129
130 return false;
131 }
132
133 void synchronize() const {
134 DeviceGuard guard{stream_.device()};
135 c10::cuda::stream_synchronize(stream());
136 }
137
138 int priority() const {
139 DeviceGuard guard{stream_.device()};
140 int priority = 0;
141 C10_CUDA_CHECK(cudaStreamGetPriority(stream(), &priority));
142 return priority;
143 }
144
146 cudaStream_t stream() const;
147
149 Stream unwrap() const {
150 return stream_;
151 }
152
162 struct c10::StreamData3 pack3() const {
163 return stream_.pack3();
164 }
165
166 // Unpack a CUDAStream from the 3 fields generated by pack().
168 StreamId stream_id,
169 DeviceIndex device_index,
170 DeviceType device_type) {
171 return CUDAStream(Stream::unpack3(stream_id, device_index, device_type));
172 }
173
174 static std::tuple<int, int> priority_range() {
175 // Note: this returns the range of priority **supported by PyTorch**, not
176 // the range of priority **supported by CUDA**. The former is a subset of
177 // the latter. Currently PyTorch only supports 0 and -1, which are "low" and
178 // "high" priority.
179 int least_priority, greatest_priority;
180 C10_CUDA_CHECK(
181 cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority));
183 least_priority >= 0, "Unexpected CUDA stream priority range");
185 greatest_priority <= -1, "Unexpected CUDA stream priority range");
186 return std::make_tuple(0, -1);
187 }
188
189 // Deleted for now; use CUDAEvent::block instead
190 // void synchronize_with(const CUDAEvent& event) const;
191
192 private:
193 Stream stream_;
194};
195
/// Get a new stream from the CUDA stream pool for the given device (-1 for
/// the default device — presumably the current device; confirm against the
/// implementation).  Pool streams are handed out round-robin; see the
/// "Stream pool note" at the top of this file.  Pass isHighPriority = true
/// to draw from the high-priority pool instead.
C10_API CUDAStream
getStreamFromPool(const bool isHighPriority = false, DeviceIndex device = -1);

/// Get a CUDAStream wrapping an externally allocated cudaStream_t.
/// NOTE(review): presumably the caller retains ownership/lifetime of
/// ext_stream — confirm against the implementation.
C10_API CUDAStream
getStreamFromExternal(cudaStream_t ext_stream, DeviceIndex device_index);

/// Get the default CUDA stream for the passed CUDA device, or for the
/// current device if no device index is passed.
C10_API CUDAStream getDefaultCUDAStream(DeviceIndex device_index = -1);

/// Get the current CUDA stream for the passed CUDA device, or for the
/// current device if no device index is passed.  The current stream is
/// thread-local (see the note at the top of this file).
C10_API CUDAStream getCurrentCUDAStream(DeviceIndex device_index = -1);

/// Set the current stream, on the device of the passed-in stream, to be
/// the passed-in stream.
C10_API void setCurrentCUDAStream(CUDAStream stream);

/// Print a representation of the stream to the given output stream.
C10_API std::ostream& operator<<(std::ostream& stream, const CUDAStream& s);
249
250} // namespace cuda
251} // namespace c10
252
253namespace std {
254template <>
255struct hash<c10::cuda::CUDAStream> {
256 size_t operator()(c10::cuda::CUDAStream s) const noexcept {
257 return std::hash<c10::Stream>{}(s.unwrap());
258 }
259};
260} // namespace std
#define TORCH_INTERNAL_ASSERT(cond,...)
Definition: Exception.h:377
#define TORCH_CHECK(cond,...)
Definition: Exception.h:505
Definition: CUDAStream.h:62
Stream unwrap() const
Explicit conversion to Stream.
Definition: CUDAStream.h:149
cudaStream_t stream() const
Explicit conversion to cudaStream_t.
DeviceType device_type() const
Used to avoid baking in device type explicitly to Python-side API.
Definition: CUDAStream.h:97
bool query() const
Definition: CUDAStream.h:117
StreamId id() const
Return the stream ID corresponding to this particular stream.
Definition: CUDAStream.h:113
int priority() const
Definition: CUDAStream.h:138
CUDAStream(Unchecked, Stream stream)
Construct a CUDAStream from a Stream with no error checking.
Definition: CUDAStream.h:75
Device device() const
Get the full Device that this stream is associated with.
Definition: CUDAStream.h:108
Unchecked
Definition: CUDAStream.h:64
static std::tuple< int, int > priority_range()
Definition: CUDAStream.h:174
static CUDAStream unpack3(StreamId stream_id, DeviceIndex device_index, DeviceType device_type)
Definition: CUDAStream.h:167
void synchronize() const
Definition: CUDAStream.h:133
DeviceIndex device_index() const
Get the CUDA device index that this stream is associated with.
Definition: CUDAStream.h:102
CUDAStream(Stream stream)
Construct a CUDAStream from a Stream.
Definition: CUDAStream.h:68
bool operator!=(const CUDAStream &other) const noexcept
Definition: CUDAStream.h:81
bool operator==(const CUDAStream &other) const noexcept
Definition: CUDAStream.h:77
CUDAStream getDefaultCUDAStream(DeviceIndex device_index=-1)
Get the default CUDA stream, for the passed CUDA device, or for the current device if no device index is passed.
CUDAStream getStreamFromExternal(cudaStream_t ext_stream, DeviceIndex device_index)
Get a CUDAStream from a externally allocated one.
CUDAStream getStreamFromPool(const bool isHighPriority=false, DeviceIndex device=-1)
Get a new stream from the CUDA stream pool.
void setCurrentCUDAStream(CUDAStream stream)
Set the current stream on the device of the passed in stream to be the passed in stream.
std::ostream & operator<<(std::ostream &stream, const CUDAStream &s)
CUDAStream getCurrentCUDAStream(DeviceIndex device_index=-1)
Get the current CUDA stream, for the passed CUDA device, or for the current device if no device index is passed.
Definition: ivalue.h:27
int8_t DeviceIndex
An index representing a specific device; e.g., the 1 in GPU 1.
Definition: Device.h:18
DeviceType
Definition: DeviceType.h:33
Definition: Device.h:181
Represents a compute device on which a tensor is located.
Definition: Device.h:30