/*
 * Copyright 1993-2020 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee. Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users. These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */

#ifndef _CUDA_PIPELINE
#define _CUDA_PIPELINE

#include <cuda/atomic>
#include <cuda/barrier>

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

template<thread_scope _Scope>
class pipeline;

enum class pipeline_role {
    producer,
    consumer
};

template<thread_scope _Scope>
struct __pipeline_stage {
    barrier<_Scope> __produced;
    barrier<_Scope> __consumed;
};

template<thread_scope _Scope, uint8_t _Stages_count>
class pipeline_shared_state {
public:
    pipeline_shared_state() = default;
    pipeline_shared_state(const pipeline_shared_state &) = delete;
    pipeline_shared_state(pipeline_shared_state &&) = delete;
    pipeline_shared_state & operator=(pipeline_shared_state &&) = delete;
    pipeline_shared_state & operator=(const pipeline_shared_state &) = delete;

private:
    __pipeline_stage<_Scope> __stages[_Stages_count];
    atomic<uint32_t, _Scope> __refcount;

    template<thread_scope _Pipeline_scope>
    friend class pipeline;

    template<class _Group, thread_scope _Pipeline_scope, uint8_t _Pipeline_stages_count>
    friend _LIBCUDACXX_INLINE_VISIBILITY
    pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state);

    template<class _Group, thread_scope _Pipeline_scope, uint8_t _Pipeline_stages_count>
    friend _LIBCUDACXX_INLINE_VISIBILITY
    pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state, size_t __producer_count);

    template<class _Group, thread_scope _Pipeline_scope, uint8_t _Pipeline_stages_count>
    friend _LIBCUDACXX_INLINE_VISIBILITY
    pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state, pipeline_role __role);
};

struct __pipeline_asm_helper {
    __device__
    static inline uint32_t __lane_id()
    {
        uint32_t __lane_id;
        asm volatile ("mov.u32 %0, %%laneid;" : "=r"(__lane_id));
        return __lane_id;
    }
};
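// Usage sketch (illustrative only, not part of the shipped interface): a
// unified cuda::thread_scope_block pipeline in which every thread of the
// block both produces and consumes. The two-stage depth, the 128-thread
// block, and the names `smem`, `gmem_in`, and `gmem_out` are assumptions
// made for this example.
//
//   #include <cuda/pipeline>
//   #include <cooperative_groups.h>
//
//   __global__ void stage_and_copy(int * gmem_out, const int * gmem_in, int batches) {
//       auto block = cooperative_groups::this_thread_block();     // 128 threads assumed
//       __shared__ cuda::pipeline_shared_state<cuda::thread_scope_block, 2> state;
//       __shared__ int smem[2][128];
//       auto pipe = cuda::make_pipeline(block, &state);
//
//       for (int batch = 0; batch < batches; ++batch) {
//           pipe.producer_acquire();
//           cuda::memcpy_async(block, smem[batch % 2], gmem_in + batch * 128,
//                              sizeof(int) * 128, pipe);
//           pipe.producer_commit();
//
//           pipe.consumer_wait();
//           gmem_out[batch * 128 + block.thread_rank()] = smem[batch % 2][block.thread_rank()];
//           pipe.consumer_release();
//       }
//   }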
template<thread_scope _Scope>
class pipeline {
public:
    pipeline(pipeline &&) = default;
    pipeline(const pipeline &) = delete;
    pipeline & operator=(pipeline &&) = delete;
    pipeline & operator=(const pipeline &) = delete;

    _LIBCUDACXX_INLINE_VISIBILITY
    ~pipeline()
    {
        if (__active) {
            (void)quit();
        }
    }

    _LIBCUDACXX_INLINE_VISIBILITY
    bool quit()
    {
#ifdef __CUDA_ARCH__
        const uint32_t __match_mask = __match_any_sync(__activemask(), reinterpret_cast<uintptr_t>(__shared_state_get_refcount()));
        const uint32_t __elected_id = __ffs(__match_mask) - 1;
        const bool __elected = (__pipeline_asm_helper::__lane_id() == __elected_id);
        const uint32_t __sub_count = __popc(__match_mask);
#else
        const bool __elected = true;
        const uint32_t __sub_count = 1;
#endif
        bool __released = false;
        if (__elected) {
            const uint32_t __old = __shared_state_get_refcount()->fetch_sub(__sub_count);
            const bool __last = (__old == __sub_count);
            if (__last) {
                for (uint8_t __stage = 0; __stage < __stages_count; ++__stage) {
                    __shared_state_get_stage(__stage)->__produced.~barrier();
                    if (__partitioned) {
                        __shared_state_get_stage(__stage)->__consumed.~barrier();
                    }
                }
                __released = true;
            }
        }
        __active = false;
        return __released;
    }

    _LIBCUDACXX_INLINE_VISIBILITY
    void producer_acquire()
    {
        if (__partitioned) {
            barrier<_Scope> & __stage_barrier = __shared_state_get_stage(__head)->__consumed;
            (void)_CUDA_VSTD::__libcpp_thread_poll_with_backoff(__poll_tester(__stage_barrier, __consumed_phase_parity));
        }
    }

    _LIBCUDACXX_INLINE_VISIBILITY
    void producer_commit()
    {
        barrier<_Scope> & __stage_barrier = __shared_state_get_stage(__head)->__produced;
        __memcpy_async_synchronize(__stage_barrier, true);
        (void)__stage_barrier.arrive();
        if (++__head == __stages_count) {
            __head = 0;
            if (__partitioned) {
                __consumed_phase_parity = !__consumed_phase_parity;
            }
        }
    }

    _LIBCUDACXX_INLINE_VISIBILITY
    void consumer_wait()
    {
        barrier<_Scope> & __stage_barrier = __shared_state_get_stage(__tail)->__produced;
        (void)_CUDA_VSTD::__libcpp_thread_poll_with_backoff(__poll_tester(__stage_barrier, __produced_phase_parity));
    }

    _LIBCUDACXX_INLINE_VISIBILITY
    void consumer_release()
    {
        if (__partitioned) {
            (void)__shared_state_get_stage(__tail)->__consumed.arrive();
        }
        if (++__tail == __stages_count) {
            __tail = 0;
            __produced_phase_parity = !__produced_phase_parity;
        }
    }

    template<class _Rep, class _Period>
    _LIBCUDACXX_INLINE_VISIBILITY
    bool consumer_wait_for(const _CUDA_VSTD::chrono::duration<_Rep, _Period> & __duration)
    {
        barrier<_Scope> & __stage_barrier = __shared_state_get_stage(__tail)->__produced;
        return _CUDA_VSTD::__libcpp_thread_poll_with_backoff(
            __poll_tester(__stage_barrier, __produced_phase_parity),
            _CUDA_VSTD::chrono::duration_cast<_CUDA_VSTD::chrono::nanoseconds>(__duration)
        );
    }

    template<class _Clock, class _Duration>
    _LIBCUDACXX_INLINE_VISIBILITY
    bool consumer_wait_until(const _CUDA_VSTD::chrono::time_point<_Clock, _Duration> & __time_point)
    {
        return consumer_wait_for(__time_point - _Clock::now());
    }

private:
    // Ring indices into the stage array and per-direction phase parities;
    // each parity flips whenever the corresponding index wraps around.
    uint8_t __head : 8;
    uint8_t __tail : 8;
    const uint8_t __stages_count : 8;
    bool __consumed_phase_parity : 1;
    bool __produced_phase_parity : 1;
    bool __active : 1;
    const bool __partitioned : 1;
    char * const __shared_state;

    _LIBCUDACXX_INLINE_VISIBILITY
    pipeline(char * __shared_state, uint8_t __stages_count, bool __partitioned)
        : __head(0)
        , __tail(0)
        , __stages_count(__stages_count)
        , __consumed_phase_parity(true)
        , __produced_phase_parity(false)
        , __active(true)
        , __partitioned(__partitioned)
        , __shared_state(__shared_state)
    {}

    _LIBCUDACXX_INLINE_VISIBILITY
    static bool __barrier_try_wait_parity_impl(barrier<_Scope> & __barrier, bool __phase_parity)
    {
        typename barrier<_Scope>::arrival_token __synthesized_token = (__phase_parity ? 1ull : 0ull) << 63;
        return __barrier.__try_wait(_CUDA_VSTD::move(__synthesized_token));
    }

    _LIBCUDACXX_INLINE_VISIBILITY
    static bool __barrier_try_wait_parity(barrier<_Scope> & __barrier, bool __phase_parity)
    {
        return __barrier_try_wait_parity_impl(__barrier, __phase_parity);
    }

    struct __poll_tester {
        barrier<_Scope> & __barrier;
        bool __phase_parity;

        _LIBCUDACXX_INLINE_VISIBILITY
        __poll_tester(barrier<_Scope> & __barrier, bool __phase_parity)
            : __barrier(__barrier)
            , __phase_parity(__phase_parity)
        {}

        _LIBCUDACXX_INLINE_VISIBILITY
        bool operator()() const
        {
            return __barrier_try_wait_parity(__barrier, __phase_parity);
        }
    };

    _LIBCUDACXX_INLINE_VISIBILITY
    __pipeline_stage<_Scope> * __shared_state_get_stage(uint8_t __stage)
    {
        ptrdiff_t __stage_offset = __stage * sizeof(__pipeline_stage<_Scope>);
        return reinterpret_cast<__pipeline_stage<_Scope>*>(__shared_state + __stage_offset);
    }

    _LIBCUDACXX_INLINE_VISIBILITY
    atomic<uint32_t, _Scope> * __shared_state_get_refcount()
    {
        ptrdiff_t __refcount_offset = __stages_count * sizeof(__pipeline_stage<_Scope>);
        return reinterpret_cast<atomic<uint32_t, _Scope>*>(__shared_state + __refcount_offset);
    }

    template<class _Group, thread_scope _Pipeline_scope, uint8_t _Pipeline_stages_count>
    friend _LIBCUDACXX_INLINE_VISIBILITY
    pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state);

    template<class _Group, thread_scope _Pipeline_scope, uint8_t _Pipeline_stages_count>
    friend _LIBCUDACXX_INLINE_VISIBILITY
    pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state, size_t __producer_count);

    template<class _Group, thread_scope _Pipeline_scope, uint8_t _Pipeline_stages_count>
    friend _LIBCUDACXX_INLINE_VISIBILITY
    pipeline<_Pipeline_scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Pipeline_scope, _Pipeline_stages_count> * __shared_state, pipeline_role __role);
};

// Block-scoped specialization: on SM 8.0 and newer, a barrier that lives in
// shared memory can be polled directly through its mbarrier native handle.
template<>
_LIBCUDACXX_INLINE_VISIBILITY
bool pipeline<thread_scope_block>::__barrier_try_wait_parity(barrier<thread_scope_block> & __barrier, bool __phase_parity)
{
#if __CUDA_ARCH__ >= 800
    if (__isShared(&__barrier)) {
        uint64_t * __mbarrier = device::barrier_native_handle(__barrier);
        uint16_t __wait_complete;

        asm volatile ("{"
                      "    .reg .pred %p;"
                      "    mbarrier.test_wait.parity.shared.b64 %p, [%1], %2;"
                      "    selp.u16 %0, 1, 0, %p;"
                      "}"
            : "=h"(__wait_complete)
            : "r"(static_cast<uint32_t>(__cvta_generic_to_shared(__mbarrier))), "r"(static_cast<uint32_t>(__phase_parity))
            : "memory");

        return bool(__wait_complete);
    } else
#endif
    {
        return __barrier_try_wait_parity_impl(__barrier, __phase_parity);
    }
}
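// Usage sketch (illustrative only): the overloads below bind a group to a
// pipeline_shared_state placed in shared memory. Passing a pipeline_role
// partitions the group into producers and consumers; the warp split chosen
// here is an assumption made for this example.
//
//   auto block = cooperative_groups::this_thread_block();
//   __shared__ cuda::pipeline_shared_state<cuda::thread_scope_block, 2> state;
//   const cuda::pipeline_role role = (block.thread_rank() < 32)
//       ? cuda::pipeline_role::producer
//       : cuda::pipeline_role::consumer;
//   auto pipe = cuda::make_pipeline(block, &state, role);
//
//   if (role == cuda::pipeline_role::producer) {
//       pipe.producer_acquire();
//       // submit cuda::memcpy_async(...) into the acquired stage
//       pipe.producer_commit();
//   } else {
//       pipe.consumer_wait();
//       // read the staged data
//       pipe.consumer_release();
//   }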
template<class _Group, thread_scope _Scope, uint8_t _Stages_count>
_LIBCUDACXX_INLINE_VISIBILITY
pipeline<_Scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Scope, _Stages_count> * __shared_state)
{
    const uint32_t __group_size = static_cast<uint32_t>(__group.size());
    const uint32_t __thread_rank = static_cast<uint32_t>(__group.thread_rank());

    if (__thread_rank == 0) {
        for (uint8_t __stage = 0; __stage < _Stages_count; ++__stage) {
            init(&__shared_state->__stages[__stage].__produced, __group_size);
        }
        __shared_state->__refcount.store(__group_size, std::memory_order_relaxed);
    }
    __group.sync();

    return pipeline<_Scope>(reinterpret_cast<char *>(__shared_state->__stages), _Stages_count, false);
}

template<class _Group, thread_scope _Scope, uint8_t _Stages_count>
_LIBCUDACXX_INLINE_VISIBILITY
pipeline<_Scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Scope, _Stages_count> * __shared_state, size_t __producer_count)
{
    const uint32_t __group_size = static_cast<uint32_t>(__group.size());
    const uint32_t __thread_rank = static_cast<uint32_t>(__group.thread_rank());

    if (__thread_rank == 0) {
        const size_t __consumer_count = __group_size - __producer_count;
        for (uint8_t __stage = 0; __stage < _Stages_count; ++__stage) {
            init(&__shared_state->__stages[__stage].__consumed, __consumer_count);
            init(&__shared_state->__stages[__stage].__produced, __producer_count);
        }
        __shared_state->__refcount.store(__group_size, std::memory_order_relaxed);
    }
    __group.sync();

    return pipeline<_Scope>(reinterpret_cast<char *>(__shared_state->__stages), _Stages_count, true);
}

// Partitioned construction from per-thread roles: producers are counted at
// run time through __refcount before the stage barriers are initialized.
template<class _Group, thread_scope _Scope, uint8_t _Stages_count>
_LIBCUDACXX_INLINE_VISIBILITY
pipeline<_Scope> make_pipeline(const _Group & __group, pipeline_shared_state<_Scope, _Stages_count> * __shared_state, pipeline_role __role)
{
    const uint32_t __group_size = static_cast<uint32_t>(__group.size());
    const uint32_t __thread_rank = static_cast<uint32_t>(__group.thread_rank());

    if (__thread_rank == 0) {
        __shared_state->__refcount.store(0, std::memory_order_relaxed);
    }
    __group.sync();

    if (__role == pipeline_role::producer) {
#ifdef __CUDA_ARCH__
        const uint32_t __match_mask = __match_any_sync(__activemask(), reinterpret_cast<uintptr_t>(&__shared_state->__refcount));
        const uint32_t __elected_id = __ffs(__match_mask) - 1;
        const bool __elected = (__pipeline_asm_helper::__lane_id() == __elected_id);
        const uint32_t __add_count = __popc(__match_mask);
#else
        const bool __elected = true;
        const uint32_t __add_count = 1;
#endif
        if (__elected) {
            (void)__shared_state->__refcount.fetch_add(__add_count, std::memory_order_relaxed);
        }
    }
    __group.sync();

    if (__thread_rank == 0) {
        const uint32_t __producer_count = __shared_state->__refcount.load(std::memory_order_relaxed);
        const uint32_t __consumer_count = __group_size - __producer_count;
        for (uint8_t __stage = 0; __stage < _Stages_count; ++__stage) {
            init(&__shared_state->__stages[__stage].__consumed, __consumer_count);
            init(&__shared_state->__stages[__stage].__produced, __producer_count);
        }
        __shared_state->__refcount.store(__group_size, std::memory_order_relaxed);
    }
    __group.sync();

    return pipeline<_Scope>(reinterpret_cast<char *>(__shared_state->__stages), _Stages_count, true);
}

_LIBCUDACXX_END_NAMESPACE_CUDA
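// Usage sketch (illustrative only): the thread-scoped specialization defined
// below keeps no shared state; on SM 8.0 and newer, producer_commit and
// consumer_wait map onto cp.async commit/wait groups. The buffer names and
// the trailing __syncthreads() are assumptions made for this example.
//
//   __shared__ int smem[128];
//   auto pipe = cuda::make_pipeline();    // cuda::pipeline<cuda::thread_scope_thread>
//   pipe.producer_acquire();
//   cuda::memcpy_async(&smem[threadIdx.x], &gmem_in[threadIdx.x], sizeof(int), pipe);
//   pipe.producer_commit();
//   pipe.consumer_wait();                 // or cuda::pipeline_consumer_wait_prior<0>(pipe)
//   int value = smem[threadIdx.x];
//   pipe.consumer_release();
//   __syncthreads();                      // before other threads read this thread's staged data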
_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE

template<uint8_t _Prior>
__device__
void __pipeline_consumer_wait(pipeline<thread_scope_thread> & __pipeline);

__device__
inline void __pipeline_consumer_wait(pipeline<thread_scope_thread> & __pipeline, uint8_t __prior);

_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

template<>
class pipeline<thread_scope_thread> {
public:
    pipeline(pipeline &&) = default;
    pipeline(const pipeline &) = delete;
    pipeline & operator=(pipeline &&) = delete;
    pipeline & operator=(const pipeline &) = delete;

    _LIBCUDACXX_INLINE_VISIBILITY
    ~pipeline() {}

    _LIBCUDACXX_INLINE_VISIBILITY
    bool quit()
    {
        return true;
    }

    _LIBCUDACXX_INLINE_VISIBILITY
    void producer_acquire() {}

    _LIBCUDACXX_INLINE_VISIBILITY
    void producer_commit()
    {
#if __CUDA_ARCH__ >= 800
        asm volatile ("cp.async.commit_group;");
        ++__head;
#endif
    }

    _LIBCUDACXX_INLINE_VISIBILITY
    void consumer_wait()
    {
#if __CUDA_ARCH__ >= 800
        if (__head == __tail) {
            return;
        }

        const uint8_t __prior = __head - __tail - 1;
        device::__pipeline_consumer_wait(*this, __prior);
        ++__tail;
#endif
    }

    _LIBCUDACXX_INLINE_VISIBILITY
    void consumer_release() {}

    template<class _Rep, class _Period>
    _LIBCUDACXX_INLINE_VISIBILITY
    bool consumer_wait_for(const _CUDA_VSTD::chrono::duration<_Rep, _Period> & __duration)
    {
        (void)__duration;
        consumer_wait();
        return true;
    }

    template<class _Clock, class _Duration>
    _LIBCUDACXX_INLINE_VISIBILITY
    bool consumer_wait_until(const _CUDA_VSTD::chrono::time_point<_Clock, _Duration> & __time_point)
    {
        (void)__time_point;
        consumer_wait();
        return true;
    }

private:
    uint8_t __head;
    uint8_t __tail;

    _LIBCUDACXX_INLINE_VISIBILITY
    pipeline()
        : __head(0)
        , __tail(0)
    {}

    friend _LIBCUDACXX_INLINE_VISIBILITY
    inline pipeline<thread_scope_thread> make_pipeline();

    template<uint8_t _Prior>
    friend _LIBCUDACXX_INLINE_VISIBILITY
    void pipeline_consumer_wait_prior(pipeline<thread_scope_thread> & __pipeline);
};

_LIBCUDACXX_END_NAMESPACE_CUDA

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE

template<uint8_t _Prior>
__device__
void __pipeline_consumer_wait(pipeline<thread_scope_thread> & __pipeline)
{
    (void)__pipeline;
#if __CUDA_ARCH__ >= 800
    constexpr uint8_t __max_prior = 8;

    asm volatile ("cp.async.wait_group %0;"
        :
        : "n"(_Prior < __max_prior ? _Prior : __max_prior));
#endif
}

__device__
inline void __pipeline_consumer_wait(pipeline<thread_scope_thread> & __pipeline, uint8_t __prior)
{
    switch (__prior) {
    case 0:  device::__pipeline_consumer_wait<0>(__pipeline); break;
    case 1:  device::__pipeline_consumer_wait<1>(__pipeline); break;
    case 2:  device::__pipeline_consumer_wait<2>(__pipeline); break;
    case 3:  device::__pipeline_consumer_wait<3>(__pipeline); break;
    case 4:  device::__pipeline_consumer_wait<4>(__pipeline); break;
    case 5:  device::__pipeline_consumer_wait<5>(__pipeline); break;
    case 6:  device::__pipeline_consumer_wait<6>(__pipeline); break;
    case 7:  device::__pipeline_consumer_wait<7>(__pipeline); break;
    default: device::__pipeline_consumer_wait<8>(__pipeline); break;
    }
}

_LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE

_LIBCUDACXX_BEGIN_NAMESPACE_CUDA

_LIBCUDACXX_INLINE_VISIBILITY
inline pipeline<thread_scope_thread> make_pipeline()
{
    return pipeline<thread_scope_thread>();
}

template<uint8_t _Prior>
_LIBCUDACXX_INLINE_VISIBILITY
void pipeline_consumer_wait_prior(pipeline<thread_scope_thread> & __pipeline)
{
#if __CUDA_ARCH__ >= 800
    device::__pipeline_consumer_wait<_Prior>(__pipeline);
    __pipeline.__tail = __pipeline.__head - _Prior;
#endif
}

template<thread_scope _Scope>
_LIBCUDACXX_INLINE_VISIBILITY
void pipeline_producer_commit(pipeline<thread_scope_thread> & __pipeline, barrier<_Scope> & __barrier)
{
    (void)__pipeline;
#if __CUDA_ARCH__ >= 800
    __memcpy_async_synchronize(__barrier, true);
#endif
}

template<thread_scope _Scope>
_LIBCUDACXX_INLINE_VISIBILITY
void __memcpy_async_synchronize(pipeline<_Scope> & __pipeline, bool __is_async) {
    // memcpy_async submissions never synchronize on their own in the pipeline path.
    (void)__pipeline;
    (void)__is_async;
}

template<typename _Group, class _Type, thread_scope _Scope>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Group const & __group, _Type * __destination, _Type const * __source, std::size_t __size, pipeline<_Scope> & __pipeline) {
    // When compiling with NVCC and GCC 4.8, certain user defined types that _are_ trivially copyable are
    // incorrectly classified as not trivially copyable. Remove this assertion to allow for their usage with
    // memcpy_async when compiling with GCC 4.8.
    // FIXME: remove the #if once GCC 4.8 is no longer supported.
#if !defined(_LIBCUDACXX_COMPILER_GCC) || _GNUC_VER > 408
    static_assert(_CUDA_VSTD::is_trivially_copyable<_Type>::value, "memcpy_async requires a trivially copyable type");
#endif

    __memcpy_async<alignof(_Type)>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __pipeline);
}

template<typename _Group, class _Type, std::size_t _Alignment, thread_scope _Scope, std::size_t _Larger_alignment = (alignof(_Type) > _Alignment) ? alignof(_Type) : _Alignment>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Group const & __group, _Type * __destination, _Type const * __source, aligned_size_t<_Alignment> __size, pipeline<_Scope> & __pipeline) {
    // When compiling with NVCC and GCC 4.8, certain user defined types that _are_ trivially copyable are
    // incorrectly classified as not trivially copyable. Remove this assertion to allow for their usage with
    // memcpy_async when compiling with GCC 4.8.
    // FIXME: remove the #if once GCC 4.8 is no longer supported.
#if !defined(_LIBCUDACXX_COMPILER_GCC) || _GNUC_VER > 408
    static_assert(_CUDA_VSTD::is_trivially_copyable<_Type>::value, "memcpy_async requires a trivially copyable type");
#endif

    __memcpy_async<_Larger_alignment>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __pipeline);
}

template<class _Type, typename _Size, thread_scope _Scope>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Type * __destination, _Type const * __source, _Size __size, pipeline<_Scope> & __pipeline) {
    memcpy_async(__single_thread_group{}, __destination, __source, __size, __pipeline);
}

template<typename _Group, thread_scope _Scope>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Group const & __group, void * __destination, void const * __source, std::size_t __size, pipeline<_Scope> & __pipeline) {
    __memcpy_async<1>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __pipeline);
}

template<typename _Group, std::size_t _Alignment, thread_scope _Scope>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(_Group const & __group, void * __destination, void const * __source, aligned_size_t<_Alignment> __size, pipeline<_Scope> & __pipeline) {
    __memcpy_async<_Alignment>(__group, reinterpret_cast<char *>(__destination), reinterpret_cast<char const *>(__source), __size, __pipeline);
}

template<typename _Size, thread_scope _Scope>
_LIBCUDACXX_INLINE_VISIBILITY
void memcpy_async(void * __destination, void const * __source, _Size __size, pipeline<_Scope> & __pipeline) {
    memcpy_async(__single_thread_group{}, __destination, __source, __size, __pipeline);
}

_LIBCUDACXX_END_NAMESPACE_CUDA

#endif //_CUDA_PIPELINE