#include <blockwise_gemm_dlops_v3.hpp>
|
| __device__ | BlockwiseGemmDlops_km_kn_m0m1n0n1_v3 () |
| template<typename ABlockBuffer, typename BThreadBuffer, typename CThreadBuffer> |
| __device__ void | Run (const ABlockBuffer &a_block_buf, const BThreadBuffer &b_thread_buf, CThreadBuffer &c_thread_buf) const |
| template<typename ABlockSliceMoveStepIdx> |
| __device__ void | MoveABlockSliceWindow (const ABlockSliceMoveStepIdx &a_block_slice_move_step_idx) |
|
| static constexpr auto | I0 = Number<0>{} |
| static constexpr auto | I1 = Number<1>{} |
| static constexpr auto | I2 = Number<2>{} |
| static constexpr auto | I3 = Number<3>{} |
| static constexpr auto | I4 = Number<4>{} |
| static constexpr auto | E1 = ABlockDesc_E1_K1_E2{}.GetLength(I0) |
| static constexpr auto | KPerBlock = ABlockDesc_E1_K1_E2{}.GetLength(I1) |
| static constexpr auto | E2 = ABlockDesc_E1_K1_E2{}.GetLength(I2) |
| static constexpr auto | HoPerBlock = BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I2) |
| static constexpr auto | WoPerBlock = BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I3) |
| static constexpr auto | KPerThread = CThreadDesc_K_N_Ho_Wo{}.GetLength(I0) |
| static constexpr auto | HoPerThread = CThreadDesc_K_N_Ho_Wo{}.GetLength(I2) |
| static constexpr auto | WoPerThread = CThreadDesc_K_N_Ho_Wo{}.GetLength(I3) |
| static constexpr auto | a_thread_mtx_ |
| static constexpr auto | b_thread_mtx_ |
| static constexpr auto | c_thread_mtx_ |
◆ AIndex
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_E1_K1_E2, typename BBlockDesc_E1_N_Ho_Wo_E2, typename CThreadDesc_K_N_Ho_Wo,
index_t EPerThreadLoop,
index_t KPerThreadLoop>
◆ BIndex
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_E1_K1_E2, typename BBlockDesc_E1_N_Ho_Wo_E2, typename CThreadDesc_K_N_Ho_Wo,
index_t EPerThreadLoop,
index_t KPerThreadLoop>
◆ CIndex
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_E1_K1_E2, typename BBlockDesc_E1_N_Ho_Wo_E2, typename CThreadDesc_K_N_Ho_Wo,
index_t EPerThreadLoop,
index_t KPerThreadLoop>
◆ BlockwiseGemmDlops_km_kn_m0m1n0n1_v3()
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_E1_K1_E2, typename BBlockDesc_E1_N_Ho_Wo_E2, typename CThreadDesc_K_N_Ho_Wo,
index_t EPerThreadLoop,
index_t KPerThreadLoop>
| __device__ ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v3< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_E1_K1_E2, BBlockDesc_E1_N_Ho_Wo_E2, CThreadDesc_K_N_Ho_Wo, EPerThreadLoop, KPerThreadLoop >::BlockwiseGemmDlops_km_kn_m0m1n0n1_v3 |
( |
| ) |
|
|
inline |
◆ GetBeginOfCThreadDesc_K_N_Ho_Wo()
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_E1_K1_E2, typename BBlockDesc_E1_N_Ho_Wo_E2, typename CThreadDesc_K_N_Ho_Wo,
index_t EPerThreadLoop,
index_t KPerThreadLoop>
◆ GetCThreadDesc_K_N_Ho_WoLengths()
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_E1_K1_E2, typename BBlockDesc_E1_N_Ho_Wo_E2, typename CThreadDesc_K_N_Ho_Wo,
index_t EPerThreadLoop,
index_t KPerThreadLoop>
| __device__ constexpr auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v3< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_E1_K1_E2, BBlockDesc_E1_N_Ho_Wo_E2, CThreadDesc_K_N_Ho_Wo, EPerThreadLoop, KPerThreadLoop >::GetCThreadDesc_K_N_Ho_WoLengths |
( |
| ) |
|
|
inline static constexpr |
◆ MoveABlockSliceWindow()
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_E1_K1_E2, typename BBlockDesc_E1_N_Ho_Wo_E2, typename CThreadDesc_K_N_Ho_Wo,
index_t EPerThreadLoop,
index_t KPerThreadLoop>
template<typename ABlockSliceMoveStepIdx>
| __device__ void ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v3< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_E1_K1_E2, BBlockDesc_E1_N_Ho_Wo_E2, CThreadDesc_K_N_Ho_Wo, EPerThreadLoop, KPerThreadLoop >::MoveABlockSliceWindow |
( |
const ABlockSliceMoveStepIdx & | a_block_slice_move_step_idx | ) |
|
|
inline |
◆ Run()
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_E1_K1_E2, typename BBlockDesc_E1_N_Ho_Wo_E2, typename CThreadDesc_K_N_Ho_Wo,
index_t EPerThreadLoop,
index_t KPerThreadLoop>
template<typename ABlockBuffer, typename BThreadBuffer, typename CThreadBuffer>
| __device__ void ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v3< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_E1_K1_E2, BBlockDesc_E1_N_Ho_Wo_E2, CThreadDesc_K_N_Ho_Wo, EPerThreadLoop, KPerThreadLoop >::Run |
( |
const ABlockBuffer & | a_block_buf, |
|
|
const BThreadBuffer & | b_thread_buf, |
|
|
CThreadBuffer & | c_thread_buf ) const |
|
inline |
◆ a_thread_mtx_
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_E1_K1_E2, typename BBlockDesc_E1_N_Ho_Wo_E2, typename CThreadDesc_K_N_Ho_Wo,
index_t EPerThreadLoop,
index_t KPerThreadLoop>
Initial value:
integral_constant< index_t, N > Number
Definition number.hpp:12
__host__ __device__ constexpr auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition tensor_descriptor_helper.hpp:101
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
◆ b_thread_mtx_
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_E1_K1_E2, typename BBlockDesc_E1_N_Ho_Wo_E2, typename CThreadDesc_K_N_Ho_Wo,
index_t EPerThreadLoop,
index_t KPerThreadLoop>
Initial value:=
Definition blockwise_gemm_dlops_v3.hpp:22
◆ c_thread_mtx_
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_E1_K1_E2, typename BBlockDesc_E1_N_Ho_Wo_E2, typename CThreadDesc_K_N_Ho_Wo,
index_t EPerThreadLoop,
index_t KPerThreadLoop>
◆ E1
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_E1_K1_E2, typename BBlockDesc_E1_N_Ho_Wo_E2, typename CThreadDesc_K_N_Ho_Wo,
index_t EPerThreadLoop,
index_t KPerThreadLoop>
| auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v3< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_E1_K1_E2, BBlockDesc_E1_N_Ho_Wo_E2, CThreadDesc_K_N_Ho_Wo, EPerThreadLoop, KPerThreadLoop >::E1 = ABlockDesc_E1_K1_E2{}.GetLength(I0) |
|
static constexpr |
◆ E2
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_E1_K1_E2, typename BBlockDesc_E1_N_Ho_Wo_E2, typename CThreadDesc_K_N_Ho_Wo,
index_t EPerThreadLoop,
index_t KPerThreadLoop>
| auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v3< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_E1_K1_E2, BBlockDesc_E1_N_Ho_Wo_E2, CThreadDesc_K_N_Ho_Wo, EPerThreadLoop, KPerThreadLoop >::E2 = ABlockDesc_E1_K1_E2{}.GetLength(I2) |
|
static constexpr |
◆ HoPerBlock
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_E1_K1_E2, typename BBlockDesc_E1_N_Ho_Wo_E2, typename CThreadDesc_K_N_Ho_Wo,
index_t EPerThreadLoop,
index_t KPerThreadLoop>
| auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v3< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_E1_K1_E2, BBlockDesc_E1_N_Ho_Wo_E2, CThreadDesc_K_N_Ho_Wo, EPerThreadLoop, KPerThreadLoop >::HoPerBlock = BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I2) |
|
static constexpr |
◆ HoPerThread
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_E1_K1_E2, typename BBlockDesc_E1_N_Ho_Wo_E2, typename CThreadDesc_K_N_Ho_Wo,
index_t EPerThreadLoop,
index_t KPerThreadLoop>
| auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v3< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_E1_K1_E2, BBlockDesc_E1_N_Ho_Wo_E2, CThreadDesc_K_N_Ho_Wo, EPerThreadLoop, KPerThreadLoop >::HoPerThread = CThreadDesc_K_N_Ho_Wo{}.GetLength(I2) |
|
static constexpr |
◆ I0
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_E1_K1_E2, typename BBlockDesc_E1_N_Ho_Wo_E2, typename CThreadDesc_K_N_Ho_Wo,
index_t EPerThreadLoop,
index_t KPerThreadLoop>
◆ I1
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_E1_K1_E2, typename BBlockDesc_E1_N_Ho_Wo_E2, typename CThreadDesc_K_N_Ho_Wo,
index_t EPerThreadLoop,
index_t KPerThreadLoop>
◆ I2
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_E1_K1_E2, typename BBlockDesc_E1_N_Ho_Wo_E2, typename CThreadDesc_K_N_Ho_Wo,
index_t EPerThreadLoop,
index_t KPerThreadLoop>
◆ I3
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_E1_K1_E2, typename BBlockDesc_E1_N_Ho_Wo_E2, typename CThreadDesc_K_N_Ho_Wo,
index_t EPerThreadLoop,
index_t KPerThreadLoop>
◆ I4
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_E1_K1_E2, typename BBlockDesc_E1_N_Ho_Wo_E2, typename CThreadDesc_K_N_Ho_Wo,
index_t EPerThreadLoop,
index_t KPerThreadLoop>
◆ KPerBlock
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_E1_K1_E2, typename BBlockDesc_E1_N_Ho_Wo_E2, typename CThreadDesc_K_N_Ho_Wo,
index_t EPerThreadLoop,
index_t KPerThreadLoop>
| auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v3< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_E1_K1_E2, BBlockDesc_E1_N_Ho_Wo_E2, CThreadDesc_K_N_Ho_Wo, EPerThreadLoop, KPerThreadLoop >::KPerBlock = ABlockDesc_E1_K1_E2{}.GetLength(I1) |
|
static constexpr |
◆ KPerThread
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_E1_K1_E2, typename BBlockDesc_E1_N_Ho_Wo_E2, typename CThreadDesc_K_N_Ho_Wo,
index_t EPerThreadLoop,
index_t KPerThreadLoop>
| auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v3< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_E1_K1_E2, BBlockDesc_E1_N_Ho_Wo_E2, CThreadDesc_K_N_Ho_Wo, EPerThreadLoop, KPerThreadLoop >::KPerThread = CThreadDesc_K_N_Ho_Wo{}.GetLength(I0) |
|
static constexpr |
◆ WoPerBlock
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_E1_K1_E2, typename BBlockDesc_E1_N_Ho_Wo_E2, typename CThreadDesc_K_N_Ho_Wo,
index_t EPerThreadLoop,
index_t KPerThreadLoop>
| auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v3< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_E1_K1_E2, BBlockDesc_E1_N_Ho_Wo_E2, CThreadDesc_K_N_Ho_Wo, EPerThreadLoop, KPerThreadLoop >::WoPerBlock = BBlockDesc_E1_N_Ho_Wo_E2{}.GetLength(I3) |
|
static constexpr |
◆ WoPerThread
template<
index_t BlockSize, typename FloatA, typename FloatB, typename FloatC, typename ABlockDesc_E1_K1_E2, typename BBlockDesc_E1_N_Ho_Wo_E2, typename CThreadDesc_K_N_Ho_Wo,
index_t EPerThreadLoop,
index_t KPerThreadLoop>
| auto ck::BlockwiseGemmDlops_km_kn_m0m1n0n1_v3< BlockSize, FloatA, FloatB, FloatC, ABlockDesc_E1_K1_E2, BBlockDesc_E1_N_Ho_Wo_E2, CThreadDesc_K_N_Ho_Wo, EPerThreadLoop, KPerThreadLoop >::WoPerThread = CThreadDesc_K_N_Ho_Wo{}.GetLength(I3) |
|
static constexpr |
The documentation for this struct was generated from the following file: