/*
 * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee. Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users. These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item. Consistent with 48 C.F.R. 12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */

#ifndef _CUDA_PIPELINE_H_
# define _CUDA_PIPELINE_H_

# include "cuda_pipeline_primitives.h"

# if !defined(_CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER)
#  error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
         -std=c++11 compiler option.
# endif

# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
#  include "cuda_awbarrier.h"
# endif

// Integration with libcu++'s cuda::barrier.
# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
#  if defined(_LIBCUDACXX_CUDA_ABI_VERSION)
#   define _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION _LIBCUDACXX_CUDA_ABI_VERSION
#  else
#   define _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION 4
#  endif
#  define _LIBCUDACXX_PIPELINE_CONCAT(X, Y) X ## Y
#  define _LIBCUDACXX_PIPELINE_CONCAT2(X, Y) _LIBCUDACXX_PIPELINE_CONCAT(X, Y)
#  define _LIBCUDACXX_PIPELINE_INLINE_NAMESPACE _LIBCUDACXX_PIPELINE_CONCAT2(__, _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION)
namespace cuda { inline namespace _LIBCUDACXX_PIPELINE_INLINE_NAMESPACE {
    struct __block_scope_barrier_base;
}}
# endif

_CUDA_PIPELINE_BEGIN_NAMESPACE

template<size_t N, typename T>
_CUDA_PIPELINE_QUALIFIER auto segment(T* ptr) -> T(*)[N];

class pipeline {
public:
    pipeline(const pipeline&) = delete;
    pipeline(pipeline&&) = delete;
    pipeline& operator=(const pipeline&) = delete;
    pipeline& operator=(pipeline&&) = delete;

    _CUDA_PIPELINE_QUALIFIER pipeline();
    _CUDA_PIPELINE_QUALIFIER size_t commit();
    _CUDA_PIPELINE_QUALIFIER void commit_and_wait();
    _CUDA_PIPELINE_QUALIFIER void wait(size_t batch);
    template<unsigned N>
    _CUDA_PIPELINE_QUALIFIER void wait_prior();

# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
    _CUDA_PIPELINE_QUALIFIER void arrive_on(awbarrier& barrier);
    _CUDA_PIPELINE_QUALIFIER void arrive_on(cuda::__block_scope_barrier_base& barrier);
# endif

private:
    size_t current_batch;   // index of the next batch to be committed
};

template<typename T>
_CUDA_PIPELINE_QUALIFIER void memcpy_async(T& dst, const T& src, pipeline& pipe);

template<typename T, size_t DstN, size_t SrcN>
_CUDA_PIPELINE_QUALIFIER void memcpy_async(T(*dst)[DstN], const T(*src)[SrcN], pipeline& pipe);

template<size_t N, typename T>
_CUDA_PIPELINE_QUALIFIER auto segment(T* ptr) -> T(*)[N]
{
    return (T(*)[N])ptr;
}

_CUDA_PIPELINE_QUALIFIER pipeline::pipeline()
    : current_batch(0)
{
}

_CUDA_PIPELINE_QUALIFIER size_t pipeline::commit()
{
    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_commit();
    return this->current_batch++;
}

_CUDA_PIPELINE_QUALIFIER void pipeline::commit_and_wait()
{
    (void)pipeline::commit();
    pipeline::wait_prior<0>();
}

_CUDA_PIPELINE_QUALIFIER void pipeline::wait(size_t batch)
{
    const size_t prior = this->current_batch > batch ?
                         this->current_batch - batch : 0;

    switch (prior) {
    case 0  : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<0>(); break;
    case 1  : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<1>(); break;
    case 2  : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<2>(); break;
    case 3  : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<3>(); break;
    case 4  : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<4>(); break;
    case 5  : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<5>(); break;
    case 6  : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<6>(); break;
    case 7  : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<7>(); break;
    default : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<8>(); break;
    }
}

template<unsigned N>
_CUDA_PIPELINE_QUALIFIER void pipeline::wait_prior()
{
    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<N>();
}

# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
_CUDA_PIPELINE_QUALIFIER void pipeline::arrive_on(awbarrier& barrier)
{
    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(&barrier.barrier);
}

_CUDA_PIPELINE_QUALIFIER void pipeline::arrive_on(cuda::__block_scope_barrier_base& barrier)
{
    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(reinterpret_cast<uint64_t*>(&barrier));
}
# endif

template<typename T>
_CUDA_PIPELINE_QUALIFIER void memcpy_async(T& dst, const T& src, pipeline& pipe)
{
    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(&src) & (alignof(T) - 1)));
    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(&dst) & (alignof(T) - 1)));

    if (__is_trivially_copyable(T)) {
        _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_copy_relaxed<sizeof(T), alignof(T)>(
                reinterpret_cast<char*>(&dst), reinterpret_cast<const char*>(&src));
    } else {
        // Non-trivially-copyable types fall back to a synchronous copy.
        dst = src;
    }
}

template<typename T, size_t DstN, size_t SrcN>
_CUDA_PIPELINE_QUALIFIER void memcpy_async(T(*dst)[DstN], const T(*src)[SrcN], pipeline& pipe)
{
    constexpr size_t dst_size = sizeof(*dst);
    constexpr size_t src_size = sizeof(*src);
    static_assert(dst_size == 4 || dst_size == 8 || dst_size == 16, "Unsupported copy size.");
    static_assert(src_size <= dst_size, "Source size must be less than or equal to destination size.");
    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (dst_size - 1)));
    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (dst_size - 1)));

    if (__is_trivially_copyable(T)) {
        _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_copy_strict<dst_size, src_size>(
                reinterpret_cast<char*>(*dst), reinterpret_cast<const char*>(*src));
    } else {
        // Fallback: element-wise copy, value-initializing the destination tail.
        for (size_t i = 0; i < DstN; ++i) {
            (*dst)[i] = (i < SrcN) ? (*src)[i] : T();
        }
    }
}

_CUDA_PIPELINE_END_NAMESPACE

#endif /* !_CUDA_PIPELINE_H_ */
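/*
 * Illustrative usage sketch. Everything in this comment is an example, not
 * part of the header's API surface: the kernel name staged_copy, the fixed
 * block size of 256, and the buffer names are assumptions. It relies only on
 * memcpy_async/commit_and_wait behaving as defined above and on
 * _CUDA_PIPELINE_BEGIN_NAMESPACE placing the declarations in
 * nvcuda::experimental.
 *
 *   __global__ void staged_copy(int* out, const int* in) {
 *       __shared__ int smem[256];                // assumes blockDim.x == 256
 *       nvcuda::experimental::pipeline pipe;
 *
 *       // Stage a per-thread element from global into shared memory, then
 *       // commit the copy as one batch and wait for it to complete.
 *       nvcuda::experimental::memcpy_async(smem[threadIdx.x], in[threadIdx.x], pipe);
 *       pipe.commit_and_wait();
 *       __syncthreads();
 *
 *       // With a suitably sized buffer, a 16-byte (4-int) copy per thread
 *       // could instead use the strict array overload via segment<>():
 *       //   memcpy_async(nvcuda::experimental::segment<4>(dst_ptr),
 *       //                nvcuda::experimental::segment<4>(src_ptr), pipe);
 *
 *       out[threadIdx.x] = smem[threadIdx.x];
 *   }
 */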