21template <
typename GridwiseOp,
26 typename AElementwiseOperation,
27 typename BElementwiseOperation,
28 typename CDEElementwiseOperation,
29 typename AGridDesc_AK0_M_AK1,
30 typename BGridDesc_BK0_N_BK1,
31 typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
32 typename EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
33 typename Block2CTileMap,
34 typename ComputePtrOffsetOfBatch,
35 bool HasMainKBlockLoop>
37#if CK_USE_LAUNCH_BOUNDS
41 const ADataType* __restrict__ p_a_grid,
42 const BDataType* __restrict__ p_b_grid,
44 EDataType* __restrict__ p_e_grid,
45 const AElementwiseOperation a_element_op,
46 const BElementwiseOperation b_element_op,
47 const CDEElementwiseOperation cde_element_op,
49 const AGridDesc_AK0_M_AK1 a_grid_desc,
50 const BGridDesc_BK0_N_BK1 b_grid_desc,
51 const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
52 ds_grid_desc_mblock_mperblock_nblock_nperblock,
53 const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
54 e_grid_desc_mblock_mperblock_nblock_nperblock_,
55 const Block2CTileMap block_2_ctile_map,
56 const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
58#if(defined(__gfx11__) || defined(__gfx12__))
60 const index_t num_blocks_per_batch =
61 __builtin_amdgcn_readfirstlane(
get_grid_size() / batch_count);
65 static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
67 static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
69 static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
71 const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
73 __shared__
char p_shared[GridwiseOp::SharedMemTrait::lds_size];
75 DsPointer p_ds_grid_grp;
77 static constexpr index_t NumDTensor =
78 DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size();
81 [&](
auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; });
83 GridwiseOp::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
84 p_b_grid + b_batch_offset,
86 p_e_grid + e_batch_offset,
90 ds_grid_desc_mblock_mperblock_nblock_nperblock,
91 e_grid_desc_mblock_mperblock_nblock_nperblock_,
104 ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock;
105 ignore = e_grid_desc_mblock_mperblock_nblock_nperblock_;
109 ignore = compute_ptr_offset_of_batch;
110 ignore = block_2_ctile_map;
114template <
typename GridwiseOp,
121 typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
122 typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
123 typename AElementwiseOperation,
124 typename BElementwiseOperation,
125 typename CDEElementwiseOperation,
126 typename ComputePtrOffsetOfBatch,
127 typename Block2CTileMap,
128 bool HasMainKBlockLoop>
130#if CK_USE_LAUNCH_BOUNDS
134 const ADataType* __restrict__ p_a_grid,
135 const BDataType* __restrict__ p_b_grid,
137 EDataType* __restrict__ p_e_grid,
139 const AGridDesc a_grid_desc,
140 const BGridDesc b_grid_desc,
141 const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
142 ds_grid_desc_mblock_mperblock_nblock_nperblock,
143 const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
144 e_grid_desc_mblock_mperblock_nblock_nperblock,
145 const AElementwiseOperation a_element_op,
146 const BElementwiseOperation b_element_op,
147 const CDEElementwiseOperation cde_element_op,
148 const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
149 const Block2CTileMap block_2_etile_map)
151#if(defined(__gfx11__) || defined(__gfx12__))
153 __shared__
char p_shared[GridwiseOp::SharedMemTrait::lds_size];
155 const index_t num_blocks_per_batch =
156 __builtin_amdgcn_readfirstlane(
get_grid_size() / batch_count);
160 static_cast<long_index_t>(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)));
162 static_cast<long_index_t>(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx)));
164 static_cast<long_index_t>(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx)));
166 const auto ds_batch_offset = compute_ptr_offset_of_batch.GetDsPtrOffset(g_idx);
168 static constexpr index_t NumDTensor =
169 DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock::Size();
171 DsPointer p_ds_grid_grp;
174 [&](
auto i) { p_ds_grid_grp(i) = p_ds_grid[i] + ds_batch_offset[i]; });
176 GridwiseOp::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
177 p_b_grid + b_batch_offset,
179 p_e_grid + e_batch_offset,
183 ds_grid_desc_mblock_mperblock_nblock_nperblock,
184 e_grid_desc_mblock_mperblock_nblock_nperblock,
200 ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock;
201 ignore = e_grid_desc_mblock_mperblock_nblock_nperblock;
202 ignore = block_2_etile_map;
203 ignore = compute_ptr_offset_of_batch;
207template <
typename GridwiseOp,
214 typename DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
215 typename EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
216 typename AElementwiseOperation,
217 typename BElementwiseOperation,
218 typename CDEElementwiseOperation,
219 typename Block2CTileMap,
220 bool HasMainKBlockLoop>
222#if CK_USE_LAUNCH_BOUNDS
226 const BDataType* __restrict__ p_b_grid,
228 EDataType* __restrict__ p_e_grid,
229 const AGridDesc a_grid_desc,
230 const BGridDesc b_grid_desc,
231 const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
232 ds_grid_desc_mblock_mperblock_nblock_nperblock,
233 const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
234 e_grid_desc_mblock_mperblock_nblock_nperblock,
235 const AElementwiseOperation a_element_op,
236 const BElementwiseOperation b_element_op,
237 const CDEElementwiseOperation cde_element_op,
238 const Block2CTileMap block_2_ctile_map)
240#if(defined(__gfx11__) || defined(__gfx12__))
241 __shared__
char p_shared[GridwiseOp::SharedMemTrait::lds_size];
243 GridwiseOp::template Run<HasMainKBlockLoop>(p_a_grid,
250 ds_grid_desc_mblock_mperblock_nblock_nperblock,
251 e_grid_desc_mblock_mperblock_nblock_nperblock,
263 ignore = ds_grid_desc_mblock_mperblock_nblock_nperblock;
264 ignore = e_grid_desc_mblock_mperblock_nblock_nperblock;
268 ignore = block_2_ctile_map;
275 typename AccDataType,
276 typename CShuffleDataType,
282 typename DsGridDesc_M_N,
283 typename EGridDesc_M_N,
285 typename AElementwiseOperation,
286 typename BElementwiseOperation,
287 typename CDEElementwiseOperation,
300 typename ABlockTransferThreadClusterLengths_K0_M_K1,
301 typename ABlockTransferThreadClusterArrangeOrder,
302 typename ABlockTransferSrcAccessOrder,
303 index_t ABlockTransferSrcVectorDim,
304 index_t ABlockTransferSrcScalarPerVector,
305 index_t ABlockTransferDstScalarPerVector_K1,
306 bool AThreadTransferSrcResetCoordinateAfterRun,
308 bool ABlockLdsExtraM,
309 typename BBlockTransferThreadClusterLengths_K0_N_K1,
310 typename BBlockTransferThreadClusterArrangeOrder,
311 typename BBlockTransferSrcAccessOrder,
312 index_t BBlockTransferSrcVectorDim,
313 index_t BBlockTransferSrcScalarPerVector,
314 index_t BBlockTransferDstScalarPerVector_K1,
315 bool BThreadTransferSrcResetCoordinateAfterRun,
317 bool BBlockLdsExtraN,
318 index_t CShuffleMRepeatPerShuffle,
319 index_t CShuffleNRepeatPerShuffle,
320 typename CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
321 index_t CDEShuffleBlockTransferScalarPerVector_NPerBlock,
322 index_t NumGemmKPrefetchStage = 1,
341 static constexpr auto MWaves = MPerBlock / (MRepeat * MPerWmma);
342 static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma);
343 static constexpr auto WmmaK =
K1 == 16 ? 32 : 16;
349 NumGemmKPrefetchStage,
357 constexpr auto a_block_desc = [&]() {
358 if constexpr(AEnableLds)
361 constexpr auto K0PerBlock = KPerBlock /
K1;
362 constexpr auto max_lds_align =
K1;
364 if constexpr(ABlockLdsExtraM)
378 constexpr auto A_KRow =
I2;
379 constexpr auto KWmmaPerblock = KPerBlock /
WmmaK;
380 constexpr auto K0PerWmma =
WmmaK / A_KRow /
K1;
405 constexpr auto b_block_desc = [&]() {
406 if constexpr(BEnableLds)
409 constexpr auto K0PerBlock = KPerBlock /
K1;
410 constexpr auto max_lds_align =
K1;
412 if constexpr(BBlockLdsExtraN)
426 constexpr auto B_KRow =
I2;
427 constexpr auto KWmmaPerblock = KPerBlock /
WmmaK;
428 constexpr auto K0PerWmma =
WmmaK / B_KRow /
K1;
453 constexpr auto a_block_copy_step = [&]() {
454 if constexpr(AEnableLds)
456 constexpr auto K0PerBlock = KPerBlock /
K1;
462 constexpr auto KWmmaPerBlock = KPerBlock /
WmmaK;
468 return a_block_copy_step;
473 constexpr auto b_block_copy_step = [&]() {
474 if constexpr(BEnableLds)
476 constexpr auto K0PerBlock = KPerBlock /
K1;
482 constexpr auto KWmmaPerBlock = KPerBlock /
WmmaK;
488 return b_block_copy_step;
492 template <
typename ABlockDesc_>
496 constexpr auto a_wave_desc = [&]() {
497 if constexpr(AEnableLds)
500 constexpr auto A_K0 = ABlockDesc_{}.GetLength(
I0);
501 constexpr auto A_K1 = ABlockDesc_{}.GetLength(
I2);
503 constexpr auto A_KRow =
I2;
505 constexpr auto A_KRow =
I1;
519 constexpr auto KWmma = ABlockDesc_{}.GetLength(
I0);
520 constexpr auto K0PerWmma = ABlockDesc_{}.GetLength(
I3);
521 constexpr auto A_KRow = ABlockDesc_{}.GetLength(
I4);
522 constexpr auto A_K1 = ABlockDesc_{}.GetLength(
I6);
536 template <
typename BBlockDesc_>
539 constexpr auto b_wave_desc = [&]() {
540 if constexpr(BEnableLds)
543 constexpr auto B_K0 = BBlockDesc_{}.GetLength(
I0);
544 constexpr auto B_K1 = BBlockDesc_{}.GetLength(
I2);
546 constexpr auto B_KRow =
I2;
548 constexpr auto B_KRow =
I1;
562 constexpr auto KWmma = BBlockDesc_{}.GetLength(
I0);
563 constexpr auto K0PerWmma = BBlockDesc_{}.GetLength(
I3);
564 constexpr auto B_KRow = BBlockDesc_{}.GetLength(
I4);
565 constexpr auto B_K1 = BBlockDesc_{}.GetLength(
I6);
580 __host__ __device__
static constexpr auto
584 constexpr auto c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat =
591 return c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat;
601 return static_cast<const DDataType*
>(
nullptr);
607 template <
typename Block2CTileMap>
608 __host__ __device__
static constexpr bool CheckValidity(
const AGridDesc& a_grid_desc,
609 const BGridDesc& b_grid_desc,
610 const EGridDesc_M_N& e_grid_desc_m_n,
611 const Block2CTileMap& block_2_ctile_map)
614 "wrong! K1 need to be known at compile-time");
616 static_assert((MPerBlock % (MPerWmma * MRepeat) == 0) &&
617 (NPerBlock % (NRepeat * NPerWmma)) == 0,
618 "Invalid tuning param!");
620 const auto GetAProblemsizeMK = [&]() {
621 if constexpr(AEnableLds)
624 a_grid_desc.GetLength(
I0) * a_grid_desc.GetLength(
I2));
628 return make_tuple(a_grid_desc.GetLength(
I1) * a_grid_desc.GetLength(
I2) *
629 a_grid_desc.GetLength(
I5),
630 a_grid_desc.GetLength(
I0) * a_grid_desc.GetLength(
I3) *
631 a_grid_desc.GetLength(
I4) * a_grid_desc.GetLength(
I6));
635 const auto GetBProblemsizeNK = [&]() {
636 if constexpr(BEnableLds)
639 b_grid_desc.GetLength(
I0) * b_grid_desc.GetLength(
I2));
643 return make_tuple(b_grid_desc.GetLength(
I1) * b_grid_desc.GetLength(
I2) *
644 b_grid_desc.GetLength(
I5),
645 b_grid_desc.GetLength(
I0) * b_grid_desc.GetLength(
I3) *
646 b_grid_desc.GetLength(
I4) * b_grid_desc.GetLength(
I6));
650 const auto M = GetAProblemsizeMK()[
I0];
651 const auto N = GetBProblemsizeNK()[
I0];
652 const auto K = GetAProblemsizeMK()[
I1];
654 if(!(M == e_grid_desc_m_n.GetLength(
I0) && N == e_grid_desc_m_n.GetLength(
I1) &&
655 K == GetBProblemsizeNK()[
I1]))
659 printf(
"GridwiseOp: ABE descriptor dimension cross check failure\n");
664 if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0))
668 printf(
"GridwiseOp: Problemsize descriptor dimension check failure\n");
674 const auto num_k_loop = K / KPerBlock;
676 if(!GridwiseGemmPipe::IsSupported(num_k_loop))
681 if(!block_2_ctile_map.CheckValidity(e_grid_desc_m_n))
689 if(!(a_grid_desc.GetElementSpaceSize() *
sizeof(ADataType) <= TwoGB &&
690 b_grid_desc.GetElementSpaceSize() *
sizeof(BDataType) <= TwoGB &&
691 e_grid_desc_m_n.GetElementSpaceSize() *
sizeof(EDataType) <= TwoGB))
700 template <
typename Block2CTileMap>
701 __host__ __device__
static constexpr bool CheckValidity(
const AGridDesc& a_grid_desc,
702 const BGridDesc& b_grid_desc,
703 const DsGridDesc_M_N& ds_grid_desc_m_n,
704 const EGridDesc_M_N& e_grid_desc_m_n,
705 const Block2CTileMap& block_2_ctile_map)
708 "wrong! K1 need to be known at compile-time");
710 static_assert((MPerBlock % (MPerWmma * MRepeat) == 0) &&
711 (NPerBlock % (NRepeat * NPerWmma)) == 0,
712 "Invalid tuning param!");
714 const auto GetAProblemsizeMK = [&]() {
715 if constexpr(AEnableLds)
718 a_grid_desc.GetLength(
I0) * a_grid_desc.GetLength(
I2));
722 return make_tuple(a_grid_desc.GetLength(
I1) * a_grid_desc.GetLength(
I2) *
723 a_grid_desc.GetLength(
I5),
724 a_grid_desc.GetLength(
I0) * a_grid_desc.GetLength(
I3) *
725 a_grid_desc.GetLength(
I4) * a_grid_desc.GetLength(
I6));
729 const auto GetBProblemsizeNK = [&]() {
730 if constexpr(BEnableLds)
733 b_grid_desc.GetLength(
I0) * b_grid_desc.GetLength(
I2));
737 return make_tuple(b_grid_desc.GetLength(
I1) * b_grid_desc.GetLength(
I2) *
738 b_grid_desc.GetLength(
I5),
739 b_grid_desc.GetLength(
I0) * b_grid_desc.GetLength(
I3) *
740 b_grid_desc.GetLength(
I4) * b_grid_desc.GetLength(
I6));
744 const auto M = GetAProblemsizeMK()[
I0];
745 const auto N = GetBProblemsizeNK()[
I0];
746 const auto K = GetAProblemsizeMK()[
I1];
751 valid = valid && (M == ds_grid_desc_m_n[i].GetLength(
I0) &&
752 N == ds_grid_desc_m_n[i].GetLength(
I1));
759 printf(
"GridwiseOp: D descriptor dimension check failure\n");
764 if(!(M == e_grid_desc_m_n.GetLength(
I0) && N == e_grid_desc_m_n.GetLength(
I1) &&
765 K == GetBProblemsizeNK()[
I1]))
769 printf(
"GridwiseOp: ABE descriptor dimension cross check failure\n");
774 if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0))
778 printf(
"GridwiseOp: Problemsize descriptor dimension check failure\n");
784 const auto num_k_loop = K / KPerBlock;
786 if(!GridwiseGemmPipe::IsSupported(num_k_loop))
791 if(!block_2_ctile_map.CheckValidity(e_grid_desc_m_n))
799 if(!(a_grid_desc.GetElementSpaceSize() *
sizeof(ADataType) <= TwoGB &&
800 b_grid_desc.GetElementSpaceSize() *
sizeof(BDataType) <= TwoGB &&
801 e_grid_desc_m_n.GetElementSpaceSize() *
sizeof(EDataType) <= TwoGB))
811 const index_t num_loop = K / KPerBlock;
813 return GridwiseGemmPipe::CalculateHasMainLoop(num_loop);
817 template <
typename EGr
idDesc_M_N_>
818 __host__ __device__
static constexpr auto
821 const auto M = e_grid_desc_m_n.GetLength(
I0);
822 const auto N = e_grid_desc_m_n.GetLength(
I1);
824 const auto MBlock = M / MPerBlock;
825 const auto NBlock = N / NPerBlock;
834 return e_grid_desc_mblock_mperblock_nblock_nperblock;
838 template <
typename DsGr
idDesc_M_N_>
839 __host__ __device__
static constexpr auto
878 .GetElementSpaceSize();
898 template <
bool HasMainKBlockLoop,
typename Block2CTileMap = DefaultBlock2CTileMap>
899 __device__
static void Run(
const ADataType* __restrict__ p_a_grid,
900 const BDataType* __restrict__ p_b_grid,
902 EDataType* __restrict__ p_e_grid,
903 void* __restrict__ p_shared,
904 const AGridDesc& a_grid_desc,
905 const BGridDesc& b_grid_desc,
907 ds_grid_desc_mblock_mperblock_nblock_nperblock,
909 e_grid_desc_mblock_mperblock_nblock_nperblock,
910 const AElementwiseOperation& a_element_op,
911 const BElementwiseOperation& b_element_op,
912 const CDEElementwiseOperation& cde_element_op,
913 const Block2CTileMap& block_2_ctile_map)
919 p_a_grid, a_grid_desc.GetElementSpaceSize());
921 p_b_grid, b_grid_desc.GetElementSpaceSize());
926 ds_grid_desc_mblock_mperblock_nblock_nperblock[i].GetElementSpaceSize());
930 p_e_grid, e_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
935 if(!block_2_ctile_map.ValidCTileIndex(
937 make_tuple(e_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(
I0),
938 e_grid_desc_mblock_mperblock_nblock_nperblock.GetLength(
I2))))
942 const index_t m_block_data_idx_on_grid = __builtin_amdgcn_readfirstlane(block_work_idx[
I0] * MPerBlock);
943 const index_t n_block_data_idx_on_grid = __builtin_amdgcn_readfirstlane(block_work_idx[
I1] * NPerBlock);
947 const auto K = [&](){
948 if constexpr(AEnableLds){
949 return a_grid_desc.GetLength(
I0) * a_grid_desc.GetLength(
I2);
952 return a_grid_desc.GetLength(
I0) * a_grid_desc.GetLength(
I3) *
953 a_grid_desc.GetLength(
I4) * a_grid_desc.GetLength(
I6);
960 auto a_block_trait = [&](){
962 if constexpr(AEnableLds)
964 constexpr auto K0PerBlock = KPerBlock/
K1;
966 static_cast<ADataType*
>(p_shared),
967 a_block_desc.GetElementSpaceSize());
969 auto a_blockwise_copy =
971 AElementwiseOperation,
975 ABlockTransferThreadClusterLengths_K0_M_K1,
976 ABlockTransferThreadClusterArrangeOrder,
979 decltype(a_grid_desc),
980 decltype(a_block_desc),
981 ABlockTransferSrcAccessOrder,
983 ABlockTransferSrcVectorDim,
985 ABlockTransferSrcScalarPerVector,
986 ABlockTransferDstScalarPerVector_K1,
989 AThreadTransferSrcResetCoordinateAfterRun,
991 NumGemmKPrefetchStage>(
999 return make_tuple(a_block_buf, a_blockwise_copy);
1005 constexpr auto KWmmaPerBlock = KPerBlock /
WmmaK;
1006 constexpr auto K0PerWmma =
WmmaK/2/K1Value;
1008 a_block_desc.GetElementSpaceSize());
1011 auto a_blockwise_copy =
1014 decltype(a_grid_desc),
1015 decltype(a_block_desc),
1025 ABlockTransferSrcScalarPerVector,
1026 AThreadTransferSrcResetCoordinateAfterRun,
1030 m_block_data_idx_on_grid/(
MWaves * MPerWmma),
1037 return make_tuple(a_block_buf, a_blockwise_copy);
1041 auto b_block_trait = [&](){
1042 if constexpr(BEnableLds)
1044 constexpr auto K0PerBlock = KPerBlock/
K1;
1047 b_block_desc.GetElementSpaceSize());
1049 auto b_blockwise_copy =
1051 BElementwiseOperation,
1055 BBlockTransferThreadClusterLengths_K0_N_K1,
1056 BBlockTransferThreadClusterArrangeOrder,
1059 decltype(b_grid_desc),
1060 decltype(b_block_desc),
1061 BBlockTransferSrcAccessOrder,
1063 BBlockTransferSrcVectorDim,
1065 BBlockTransferSrcScalarPerVector,
1066 BBlockTransferDstScalarPerVector_K1,
1069 BThreadTransferSrcResetCoordinateAfterRun,
1071 NumGemmKPrefetchStage>(
1079 return make_tuple(b_block_buf, b_blockwise_copy);
1085 constexpr auto KWmmaPerBlock = KPerBlock /
WmmaK;
1086 constexpr auto K0PerWmma =
WmmaK/2/K1Value;
1088 b_block_desc.GetElementSpaceSize());
1091 auto b_blockwise_copy =
1094 decltype(b_grid_desc),
1095 decltype(b_block_desc),
1105 BBlockTransferSrcScalarPerVector,
1106 BThreadTransferSrcResetCoordinateAfterRun,
1110 n_block_data_idx_on_grid/(
NWaves * NPerWmma),
1117 return make_tuple(b_block_buf, b_blockwise_copy);
1121 auto a_block_buf = a_block_trait()[
I0];
1122 auto a_blockwise_copy = a_block_trait()[
I1];
1124 auto b_block_buf = b_block_trait()[
I0];
1125 auto b_blockwise_copy = b_block_trait()[
I1];
1130 auto blockwise_gemm =
1157 const index_t KBlockMainLoop = __builtin_amdgcn_readfirstlane(K / KPerBlock);
1163 a_block_slice_copy_step,
1169 b_block_slice_copy_step,
1176 constexpr auto c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs =
1177 blockwise_gemm.GetCThreadDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs();
1180 constexpr auto c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp =
1181 blockwise_gemm.GetCBlockDescriptor_MRepeat_MWave_MSubGroup_NRepeat_NWave_NThreadPerSubGroup_MAccVgprs();
1183 constexpr auto MWave = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(
I1);
1184 constexpr auto MSubGroup = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(
I2);
1185 constexpr auto NWave = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(
I4);
1186 constexpr auto NThreadPerSubGroup = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(
I5);
1187 constexpr auto MAccVgprs = c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs_tmp.GetLength(
I6);
1190 constexpr auto c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat =
1194 static_cast<CShuffleDataType*
>(p_shared),
1195 c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat.GetElementSpaceSize());
1198 c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat,
1210 NThreadPerSubGroup))),
1216 const auto c_thread_mtx_on_block = blockwise_gemm.CalculateCThreadOriginDataIndex(
I0,
I0);
1218 const index_t m_thread_data_on_block = c_thread_mtx_on_block[
I0];
1219 const index_t n_thread_data_on_block = c_thread_mtx_on_block[
I1];
1221 const auto m_thread_data_on_block_to_mrepeat_mwave_msubgroup_maccvgprs_adaptor =
1227 const auto n_thread_data_on_block_to_nrepeat_nwave_nthreadpersubgroup_adaptor =
1233 const auto m_thread_data_on_block_idx = m_thread_data_on_block_to_mrepeat_mwave_msubgroup_maccvgprs_adaptor.CalculateBottomIndex(
1236 const auto n_thread_data_on_block_idx = n_thread_data_on_block_to_nrepeat_nwave_nthreadpersubgroup_adaptor.CalculateBottomIndex(
1240 auto c_thread_copy_vgpr_to_lds =
1243 decltype(c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs),
1244 decltype(c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs),
1246 Sequence<CShuffleMRepeatPerShuffle,
1249 CShuffleNRepeatPerShuffle,
1259 c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs,
1261 m_thread_data_on_block_idx[
I1],
1262 m_thread_data_on_block_idx[
I2],
1264 n_thread_data_on_block_idx[
I1],
1265 n_thread_data_on_block_idx[
I2],
1266 m_thread_data_on_block_idx[
I3]),
1271 tie(c_shuffle_block_desc_mshrepeat_mpershrepeat_nshrepeat_npershrepeat),
1273 [&](
auto i) ->
const auto&
1274 {
return ds_grid_desc_mblock_mperblock_nblock_nperblock[i]; },
1279 tie(c_shuffle_block_buf),
1281 [&](
auto i) ->
const auto&
1282 {
return ds_grid_buf[i]; },
1299 decltype(c_ds_desc_refs),
1300 decltype(
tie(e_grid_desc_mblock_mperblock_nblock_nperblock)),
1301 CDEElementwiseOperation,
1304 CShuffleMRepeatPerShuffle * MWave * MPerWmma,
1306 CShuffleNRepeatPerShuffle * NWave * NPerWmma>,
1307 CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
1311 CDEShuffleBlockTransferScalarPerVector_NPerBlock,
1318 idx_c_ds_block_begin,
1319 tie(e_grid_desc_mblock_mperblock_nblock_nperblock),
1325 constexpr auto sfc_c_vgpr =
1328 Sequence<CShuffleMRepeatPerShuffle,
1331 CShuffleNRepeatPerShuffle,
1337 constexpr auto sfc_cde_global =
1341 CShuffleMRepeatPerShuffle * MWave * MPerWmma,
1343 CShuffleNRepeatPerShuffle * NWave * NPerWmma>>{};
1345 constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess();
1347 static_assert(num_access == sfc_cde_global.GetNumOfAccess(),
"wrong!");
1354 c_thread_copy_vgpr_to_lds.Run(c_thread_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs,
1355 sfc_c_vgpr.GetIndexTupleOfNumber(access_id),
1357 c_block_desc_mrepeat_mwave_msubgroup_nrepeat_nwave_nthreadpersubgroup_maccvgprs,
1358 c_shuffle_block_buf);
1364 cde_shuffle_block_copy_lds_to_global.Run(
1367 tie(e_grid_desc_mblock_mperblock_nblock_nperblock),
1370 if constexpr(access_id < num_access - 1)
1372 constexpr auto cde_global_step = sfc_cde_global.GetForwardStep(access_id);
1375 cde_shuffle_block_copy_lds_to_global.MoveSrcSliceWindow(
1376 c_ds_desc_refs, i +
I1, cde_global_step);
1380 cde_shuffle_block_copy_lds_to_global.MoveDstSliceWindow(
1381 tie(e_grid_desc_mblock_mperblock_nblock_nperblock),
#define CK_MIN_BLOCK_PER_CU
Definition ck.hpp:31
#define CK_MAX_THREAD_PER_BLOCK
Definition ck.hpp:30
__host__ __device__ constexpr auto integer_least_multiple(X x, Y y)
Definition utility/math.hpp:78
__host__ __device__ constexpr T max(T x)
Definition utility/math.hpp:84
__host__ __device__ constexpr auto make_multi_index(Xs &&... xs)
Definition array_multi_index.hpp:15
typename uniform_sequence_gen< NSize, I >::type uniform_sequence_gen_t
Definition utility/sequence.hpp:928
__host__ __device__ constexpr auto make_static_buffer(Number< N >)
Definition static_buffer.hpp:186
__host__ __device__ constexpr auto make_pass_through_transform(const LowLength &low_length)
Definition multi_index_transform_helper.hpp:12
__device__ index_t get_grid_size()
Definition get_id.hpp:49
__host__ __device__ constexpr auto container_concat(const X &x, const Ys &... ys)
Definition utility/container_helper.hpp:320
constexpr auto GridwiseGemmPipeline_Selector()
Definition gridwise_gemm_pipeline_selector.hpp:31
int32_t index_t
Definition ck.hpp:299
__host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple< Lengths... > &lengths, const Tuple< Strides... > &strides)
Definition tensor_descriptor_helper.hpp:49
InMemoryDataOperationEnum
Definition ck.hpp:277
@ Set
Definition ck.hpp:278
__host__ __device__ constexpr auto make_single_stage_tensor_adaptor(const Transforms &transforms, LowerDimensionOldTopIdss, UpperDimensionNewTopIdss)
Definition tensor_description/tensor_adaptor.hpp:425
remove_cv_t< remove_reference_t< T > > remove_cvref_t
Definition type.hpp:297
__host__ __device__ constexpr auto make_freeze_transform(const LowerIndex &low_idx)
Definition multi_index_transform_helper.hpp:151
constexpr Tuple< Args &... > tie(Args &... args) noexcept
Definition utility/tuple.hpp:218
integral_constant< index_t, N > Number
Definition number.hpp:12
typename tuple_element< I, TTuple >::type tuple_element_t
Definition utility/tuple.hpp:208
__host__ __device__ constexpr auto make_merge_transform(const LowLengths &low_lengths)
Definition multi_index_transform_helper.hpp:55
constexpr detail::ignore_t ignore
Definition utility/ignore.hpp:20
__device__ uint32_t amd_wave_read_first_lane(uint32_t value)
Definition amd_wave_read_first_lane.hpp:100
__device__ index_t get_block_1d_id()
Definition get_id.hpp:47
__global__ void kernel_gemm_mupltipe_d_wmma_cshuffle(const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AGridDesc a_grid_desc, const BGridDesc b_grid_desc, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const Block2CTileMap block_2_ctile_map)
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:225
__global__ void kernel_grouped_conv_multiple_d_wmma_cshuffle(const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const index_t batch_count, const AGridDesc_AK0_M_AK1 a_grid_desc, const BGridDesc_BK0_N_BK1 b_grid_desc, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_, const Block2CTileMap block_2_ctile_map, const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch)
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:40
bool EnvIsEnabled(EnvVar)
Definition utility/env.hpp:140
__host__ __device__ constexpr auto make_naive_tensor_descriptor_aligned(const Tuple< Lengths... > &lengths, Align align)
Definition tensor_descriptor_helper.hpp:132
__host__ __device__ constexpr auto generate_tuple(F &&f, Number< N >)
Definition tuple_helper.hpp:21
__host__ __device__ constexpr auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition tensor_descriptor_helper.hpp:101
__global__ void kernel_contraction_multiple_d_wmma_cshuffle(const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsPointer p_ds_grid, EDataType *__restrict__ p_e_grid, const index_t batch_count, const AGridDesc a_grid_desc, const BGridDesc b_grid_desc, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation a_element_op, const BElementwiseOperation b_element_op, const CDEElementwiseOperation cde_element_op, const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch, const Block2CTileMap block_2_etile_map)
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:133
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
LoopScheduler
Definition loop_scheduler.hpp:15
__device__ index_t get_thread_local_1d_id()
Definition get_id.hpp:41
typename sequence_merge< Sx, Sy >::type sequence_merge_t
Definition utility/sequence.hpp:925
int64_t long_index_t
Definition ck.hpp:300
__host__ __device__ constexpr auto transform_tensor_descriptor(const OldTensorDescriptor &old_tensor_desc, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss)
Definition tensor_description/tensor_descriptor.hpp:319
__device__ void block_sync_lds()
Definition synchronization.hpp:16
PipelineVersion
Definition gridwise_gemm_pipeline_selector.hpp:18
@ v1
Definition gridwise_gemm_pipeline_selector.hpp:19
__host__ __device__ constexpr auto make_unmerge_transform(const UpLengths &up_lengths, integral_constant< bool, Use24BitIntegerCalculation >=integral_constant< bool, false >{})
Definition multi_index_transform_helper.hpp:90
typename remove_cv< T >::type remove_cv_t
Definition type.hpp:295
__host__ __device__ constexpr auto make_dynamic_buffer(T *p, ElementSpaceSize element_space_size)
Definition dynamic_buffer.hpp:472
__host__ __device__ constexpr auto generate_tie(F &&f, Number< N >)
Definition tuple_helper.hpp:34
__host__ __device__ constexpr auto concat_tuple_of_reference(const Tuple< X &... > &tx, const Tuple< Y &... > &ty)
Definition tuple_helper.hpp:42
constexpr LoopScheduler make_default_loop_scheduler()
Definition loop_scheduler.hpp:20
const GenericPointer< typename T::ValueType > T2 value
Definition pointer.h:1697
Definition block_to_ctile_map.hpp:261
Definition blockwise_gemm_wmma.hpp:550
__host__ __device__ constexpr auto & GetCThreadBuffer()
Definition blockwise_gemm_wmma.hpp:585
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:858
static constexpr auto lds_size
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:882
static constexpr auto max_lds_align
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:861
static constexpr auto b_block_space_size_aligned
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:867
static constexpr auto a_block_space_offset
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:872
static constexpr auto c_shuffle_block_space_offset
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:880
static constexpr auto c_shuffle_block_space_size
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:876
static constexpr auto b_block_space_offset
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:873
static constexpr auto a_block_space_size_aligned
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:863
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:326
ck::GridwiseGemmMultipleD_Wmma< ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AGridDesc, BGridDesc, DsGridDesc_M_N, EGridDesc_M_N, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, K1, MRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, AEnableLds, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BEnableLds, BBlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector_NPerBlock, NumPrefetch, LoopSched, PipelineVer >::NWaves static constexpr auto NWaves
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:342
ck::GridwiseGemmMultipleD_Wmma< ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AGridDesc, BGridDesc, DsGridDesc_M_N, EGridDesc_M_N, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, K1, MRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, AEnableLds, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BEnableLds, BBlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector_NPerBlock, NumPrefetch, LoopSched, PipelineVer >::MWaves static constexpr auto MWaves
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:341
ck::GridwiseGemmMultipleD_Wmma< ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AGridDesc, BGridDesc, DsGridDesc_M_N, EGridDesc_M_N, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, K1, MRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, AEnableLds, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BEnableLds, BBlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector_NPerBlock, NumPrefetch, LoopSched, PipelineVer >::EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock remove_cvref_t< decltype(MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(EGridDesc_M_N{}))> EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:891
ck::GridwiseGemmMultipleD_Wmma< ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AGridDesc, BGridDesc, DsGridDesc_M_N, EGridDesc_M_N, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, K1, MRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, AEnableLds, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BEnableLds, BBlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector_NPerBlock, NumPrefetch, LoopSched, PipelineVer >::DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock remove_cvref_t< decltype(MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(DsGridDesc_M_N{}))> DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:888
__host__ static __device__ constexpr auto MakeBBlockDescriptor()
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:403
ck::GridwiseGemmMultipleD_Wmma< ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AGridDesc, BGridDesc, DsGridDesc_M_N, EGridDesc_M_N, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, K1, MRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, AEnableLds, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BEnableLds, BBlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector_NPerBlock, NumPrefetch, LoopSched, PipelineVer >::DsGridPointer decltype(MakeDsGridPointer()) DsGridPointer
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:896
ck::GridwiseGemmMultipleD_Wmma< ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AGridDesc, BGridDesc, DsGridDesc_M_N, EGridDesc_M_N, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, K1, MRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, AEnableLds, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BEnableLds, BBlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector_NPerBlock, NumPrefetch, LoopSched, PipelineVer >::ThisThreadBlock ThisThreadBlock< BlockSize > ThisThreadBlock
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:345
ck::GridwiseGemmMultipleD_Wmma< ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AGridDesc, BGridDesc, DsGridDesc_M_N, EGridDesc_M_N, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, K1, MRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, AEnableLds, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BEnableLds, BBlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector_NPerBlock, NumPrefetch, LoopSched, PipelineVer >::DefaultBlock2CTileMap remove_cvref_t< decltype(MakeDefaultBlock2CTileMap(EGridDesc_M_N{}, 1, 1))> DefaultBlock2CTileMap
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:894
__host__ static __device__ constexpr auto MakeAWaveDescriptor(const ABlockDesc_ &)
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:493
ck::GridwiseGemmMultipleD_Wmma< ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AGridDesc, BGridDesc, DsGridDesc_M_N, EGridDesc_M_N, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, K1, MRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, AEnableLds, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BEnableLds, BBlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector_NPerBlock, NumPrefetch, LoopSched, PipelineVer >::I5 static constexpr auto I5
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:334
ck::GridwiseGemmMultipleD_Wmma< ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AGridDesc, BGridDesc, DsGridDesc_M_N, EGridDesc_M_N, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, K1, MRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, AEnableLds, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BEnableLds, BBlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector_NPerBlock, NumPrefetch, LoopSched, PipelineVer >::I0 static constexpr auto I0
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:329
__host__ static __device__ constexpr bool CalculateHasMainKBlockLoop(index_t K)
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:809
__host__ static __device__ constexpr auto MakeBWaveDescriptor(const BBlockDesc_ &)
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:537
ck::GridwiseGemmMultipleD_Wmma< ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AGridDesc, BGridDesc, DsGridDesc_M_N, EGridDesc_M_N, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, K1, MRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, AEnableLds, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BEnableLds, BBlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector_NPerBlock, NumPrefetch, LoopSched, PipelineVer >::GridwiseGemmPipe remove_cvref_t< decltype(GridwiseGemmPipeline_Selector< PipelineVer, NumGemmKPrefetchStage, LoopSched, AEnableLds, BEnableLds >())> GridwiseGemmPipe
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:347
__host__ static __device__ constexpr auto MakeBBlockSliceCopyStep()
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:471
ck::GridwiseGemmMultipleD_Wmma< ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AGridDesc, BGridDesc, DsGridDesc_M_N, EGridDesc_M_N, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, K1, MRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, AEnableLds, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BEnableLds, BBlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector_NPerBlock, NumPrefetch, LoopSched, PipelineVer >::I6 static constexpr auto I6
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:335
__host__ static __device__ constexpr bool CheckValidity(const AGridDesc &a_grid_desc, const BGridDesc &b_grid_desc, const DsGridDesc_M_N &ds_grid_desc_m_n, const EGridDesc_M_N &e_grid_desc_m_n, const Block2CTileMap &block_2_ctile_map)
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:701
__host__ static __device__ constexpr auto MakeABlockDescriptor()
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:355
ck::GridwiseGemmMultipleD_Wmma< ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AGridDesc, BGridDesc, DsGridDesc_M_N, EGridDesc_M_N, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, K1, MRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, AEnableLds, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BEnableLds, BBlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector_NPerBlock, NumPrefetch, LoopSched, PipelineVer >::I1 static constexpr auto I1
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:330
ck::GridwiseGemmMultipleD_Wmma< ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AGridDesc, BGridDesc, DsGridDesc_M_N, EGridDesc_M_N, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, K1, MRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, AEnableLds, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BEnableLds, BBlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector_NPerBlock, NumPrefetch, LoopSched, PipelineVer >::K1 static constexpr auto K1
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:339
static __device__ void Run(const ADataType *__restrict__ p_a_grid, const BDataType *__restrict__ p_b_grid, DsGridPointer p_ds_grid, EDataType *__restrict__ p_e_grid, void *__restrict__ p_shared, const AGridDesc &a_grid_desc, const BGridDesc &b_grid_desc, const DsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock &ds_grid_desc_mblock_mperblock_nblock_nperblock, const EGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock &e_grid_desc_mblock_mperblock_nblock_nperblock, const AElementwiseOperation &a_element_op, const BElementwiseOperation &b_element_op, const CDEElementwiseOperation &cde_element_op, const Block2CTileMap &block_2_ctile_map)
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:899
__host__ static __device__ constexpr auto MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const EGridDesc_M_N_ &e_grid_desc_m_n)
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:819
__host__ static __device__ constexpr auto MakeDefaultBlock2CTileMap(const EGridDesc_M_N &e_grid_desc_m_n, index_t, index_t)
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:850
ck::GridwiseGemmMultipleD_Wmma< ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AGridDesc, BGridDesc, DsGridDesc_M_N, EGridDesc_M_N, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, K1, MRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, AEnableLds, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BEnableLds, BBlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector_NPerBlock, NumPrefetch, LoopSched, PipelineVer >::I3 static constexpr auto I3
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:332
ck::GridwiseGemmMultipleD_Wmma< ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AGridDesc, BGridDesc, DsGridDesc_M_N, EGridDesc_M_N, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, K1, MRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, AEnableLds, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BEnableLds, BBlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector_NPerBlock, NumPrefetch, LoopSched, PipelineVer >::I7 static constexpr auto I7
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:336
__host__ static __device__ constexpr auto MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const DsGridDesc_M_N_ &ds_grid_desc_m_n)
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:840
ck::GridwiseGemmMultipleD_Wmma< ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AGridDesc, BGridDesc, DsGridDesc_M_N, EGridDesc_M_N, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, K1, MRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, AEnableLds, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BEnableLds, BBlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector_NPerBlock, NumPrefetch, LoopSched, PipelineVer >::I4 static constexpr auto I4
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:333
static constexpr auto MakeDsGridPointer()
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:595
__host__ static __device__ constexpr bool CheckValidity(const AGridDesc &a_grid_desc, const BGridDesc &b_grid_desc, const EGridDesc_M_N &e_grid_desc_m_n, const Block2CTileMap &block_2_ctile_map)
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:608
ck::GridwiseGemmMultipleD_Wmma< ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AGridDesc, BGridDesc, DsGridDesc_M_N, EGridDesc_M_N, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, K1, MRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, AEnableLds, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BEnableLds, BBlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector_NPerBlock, NumPrefetch, LoopSched, PipelineVer >::I2 static constexpr auto I2
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:331
ck::GridwiseGemmMultipleD_Wmma< ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AGridDesc, BGridDesc, DsGridDesc_M_N, EGridDesc_M_N, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, K1, MRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, AEnableLds, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BEnableLds, BBlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector_NPerBlock, NumPrefetch, LoopSched, PipelineVer >::WmmaK static constexpr auto WmmaK
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:343
__host__ static __device__ constexpr auto MakeABlockSliceCopyStep()
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:451
ck::GridwiseGemmMultipleD_Wmma< ADataType, BDataType, AccDataType, CShuffleDataType, DsDataType, EDataType, AGridDesc, BGridDesc, DsGridDesc_M_N, EGridDesc_M_N, AElementwiseOperation, BElementwiseOperation, CDEElementwiseOperation, InMemoryDataOperationEnum::Set, MPerBlock, NPerBlock, KPerBlock, MPerWmma, NPerWmma, K1, MRepeat, NRepeat, BlockSize, ABlockTransferThreadClusterLengths_K0_M_K1, ABlockTransferThreadClusterArrangeOrder, ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim, ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_K1, false, AEnableLds, ABlockLdsAddExtraM, BBlockTransferThreadClusterLengths_K0_N_K1, BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder, BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector, BBlockTransferDstScalarPerVector_K1, false, BEnableLds, BBlockLdsAddExtraN, CShuffleMRepeatPerShuffle, CShuffleNRepeatPerShuffle, CDEShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock, CDEShuffleBlockTransferScalarPerVector_NPerBlock, NumPrefetch, LoopSched, PipelineVer >::NumDTensor static constexpr index_t NumDTensor
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:327
__host__ static __device__ constexpr auto GetCShuffleBlockDescriptor_MShRepeat_MPerShRepeat_NShRepeat_NPerShRepeat()
Definition gridwise_gemm_multiple_d_wmma_cshuffle.hpp:582
Definition utility/sequence.hpp:43
Definition tensor_space_filling_curve.hpp:20
Blockwise data transfer.
Definition thread_group_tensor_slice_transfer_v4r1.hpp:46
Definition thread_group_tensor_slice_transfer_v7.hpp:42
Definition threadwise_tensor_slice_transfer.hpp:39
Helper structure that facilitates transfer of source (grid) data to destination threads.
Definition threadwise_tensor_slice_transfer.hpp:234
Definition utility/tuple.hpp:117
Definition is_known_at_compile_time.hpp:14
Definition functional2.hpp:33
Definition tensor_operation/gpu/element/unary_element_wise_operation.hpp:340
#define CK_ENV(name)
Definition utility/env.hpp:129