Rate this Page

Program Listing for File CUDAStream.h

Return to documentation for file (c10/cuda/CUDAStream.h)

#pragma once#include<cuda_runtime_api.h>#include<c10/core/DeviceGuard.h>#include<c10/core/Stream.h>#include<c10/cuda/CUDAFunctions.h>#include<c10/util/Exception.h>/* * Stream pool note. * * A CUDAStream is an abstraction of an actual cuStream on the GPU. CUDAStreams * are backed by cuStreams, but they use several pools to minimize the costs * associated with creating, retaining, and destroying cuStreams. * * There are three pools per device, and a device's pools are lazily created. * * The first pool contains only the default stream. When the default stream * is requested it's returned. * * The second pool is the "low priority" or "default priority" streams. In * HIP builds there is no distinction between streams in this pool and streams * in the third pool (below). There are 32 of these streams per device, and * when a stream is requested one of these streams is returned round-robin. * That is, the first stream requested is at index 0, the second at index 1... * to index 31, then index 0 again. * * This means that if 33 low priority streams are requested, the first and * last streams requested are actually the same stream (under the covers) * and kernels enqueued on them cannot run concurrently. * * The third pool is the "high priority" streams. The third pool acts like * the second pool except the streams are created with a higher priority. * * These pools suggest that stream users should prefer many short-lived streams, * as the cost of acquiring and releasing streams is effectively zero. If * many longer-lived streams are required in performance critical scenarios * then the functionality here may need to be extended to allow, for example, * "reserving" a subset of the pool so that other streams do not accidentally * overlap the performance critical streams. 
* * Note: although the notion of "current stream for device" is thread local * (every OS thread has a separate current stream, as one might expect), * the stream pool is global across all threads; stream 0 is always stream 0 * no matter which thread you use it on.  Multiple threads can synchronize * on the same stream.  Although the CUDA documentation is not very clear * on the matter, streams are thread safe; e.g., it is safe to enqueue * a kernel on the same stream from two different threads. */namespacec10::cuda{staticconstexprintmax_compile_time_stream_priorities=4;// Value object representing a CUDA stream.  This is just a wrapper// around c10::Stream, but it comes with a little extra CUDA-specific// functionality (conversion to cudaStream_t), and a guarantee that// the wrapped c10::Stream really is a CUDA stream.classC10_CUDA_APICUDAStream{public:enumUnchecked{UNCHECKED};explicitCUDAStream(Streamstream):stream_(stream){TORCH_CHECK(stream_.device_type()==DeviceType::CUDA);}explicitCUDAStream(Unchecked/*unused*/,Streamstream):stream_(stream){}booloperator==(constCUDAStream&other)constnoexcept{returnunwrap()==other.unwrap();}booloperator!=(constCUDAStream&other)constnoexcept{returnunwrap()!=other.unwrap();}operatorcudaStream_t()const{returnstream();}operatorStream()const{returnunwrap();}DeviceTypedevice_type()const{returnDeviceType::CUDA;}DeviceIndexdevice_index()const{returnstream_.device_index();}Devicedevice()const{returnDevice(DeviceType::CUDA,device_index());}StreamIdid()const{returnstream_.id();}boolquery()const;voidsynchronize()const;intpriority()const{DeviceGuardguard{stream_.device()};intpriority=0;C10_CUDA_CHECK(cudaStreamGetPriority(stream(),&priority));returnpriority;}cudaStream_tstream()const;Streamunwrap()const{returnstream_;}structc10::StreamData3pack3()const{returnstream_.pack3();}// Unpack a CUDAStream from the 3 fields generated by 
pack().staticCUDAStreamunpack3(StreamIdstream_id,DeviceIndexdevice_index,DeviceTypedevice_type){returnCUDAStream(Stream::unpack3(stream_id,device_index,device_type));}staticstd::tuple<int,int>priority_range(){// Note: this returns the range of priority **supported by PyTorch**, not// the range of priority **supported by CUDA**. The former is a subset of// the latter.intleast_priority=0,greatest_priority=0;C10_CUDA_CHECK(cudaDeviceGetStreamPriorityRange(&least_priority,&greatest_priority));#ifdef USE_ROCM// See Note [HIP stream priorities]TORCH_INTERNAL_ASSERT(least_priority==1,"Unexpected HIP stream priority range");least_priority=0;#elseTORCH_INTERNAL_ASSERT(least_priority==0,"Unexpected CUDA stream priority range");#endifTORCH_INTERNAL_ASSERT(greatest_priority<=-1,"Unexpected CUDA stream priority range");greatest_priority=std::max(-c10::cuda::max_compile_time_stream_priorities+1,greatest_priority);returnstd::make_tuple(least_priority,greatest_priority);}// Deleted for now; use CUDAEvent::block instead// void synchronize_with(const CUDAEvent& event) const;private:Streamstream_;};C10_APICUDAStreamgetStreamFromPool(constboolisHighPriority=false,DeviceIndexdevice=-1);// no default priority to disambiguate overloadsC10_APICUDAStreamgetStreamFromPool(constintpriority,DeviceIndexdevice=-1);C10_APICUDAStreamgetStreamFromExternal(cudaStream_text_stream,DeviceIndexdevice_index);C10_APICUDAStreamgetDefaultCUDAStream(DeviceIndexdevice_index=-1);C10_APICUDAStreamgetCurrentCUDAStream(DeviceIndexdevice_index=-1);C10_APIvoidsetCurrentCUDAStream(CUDAStreamstream);C10_APIstd::ostream&operator<<(std::ostream&stream,constCUDAStream&s);}// namespace c10::cuda// hipify v2 backward compat in external projects#ifdef USE_ROCMnamespacec10::hip{usingc10::cuda::getStreamFromExternal;usingc10::cuda::getStreamFromPool;// must use inline wrappers instead of reference aliases due to default 
argsinlinec10::cuda::CUDAStreamgetDefaultHIPStream(DeviceIndexdevice_index=-1){returnc10::cuda::getDefaultCUDAStream(device_index);}inlinec10::cuda::CUDAStreamgetCurrentHIPStream(DeviceIndexdevice_index=-1){returnc10::cuda::getCurrentCUDAStream(device_index);}inlineauto&setCurrentHIPStream=c10::cuda::setCurrentCUDAStream;inlinec10::cuda::CUDAStreamgetStreamFromPoolMasqueradingAsCUDA(constboolisHighPriority=false,DeviceIndexdevice=-1){returnc10::cuda::getStreamFromPool(isHighPriority,device);}inlinec10::cuda::CUDAStreamgetStreamFromPoolMasqueradingAsCUDA(constintpriority,DeviceIndexdevice=-1){returnc10::cuda::getStreamFromPool(priority,device);}inlineauto&getStreamFromExternalMasqueradingAsCUDA=c10::cuda::getStreamFromExternal;inlinec10::cuda::CUDAStreamgetDefaultHIPStreamMasqueradingAsCUDA(DeviceIndexdevice_index=-1){returnc10::cuda::getDefaultCUDAStream(device_index);}inlinec10::cuda::CUDAStreamgetCurrentHIPStreamMasqueradingAsCUDA(DeviceIndexdevice_index=-1){returnc10::cuda::getCurrentCUDAStream(device_index);}inlineauto&setCurrentHIPStreamMasqueradingAsCUDA=c10::cuda::setCurrentCUDAStream;}// namespace c10::hip#endifnamespacestd{template<>structhash<c10::cuda::CUDAStream>{size_toperator()(c10::cuda::CUDAStreams)constnoexcept{returnstd::hash<c10::Stream>{}(s.unwrap());}};}// namespace std