device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp Source File#
device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
Go to the documentation of this file.
41 * strided batched, but we can easily extend to other layouts. The returned offset can be either \p
49 * \note Using \p ComputePtrOffsetOfBatch gives us the flexibility that 2 workgroups can compute 2
52 * device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk.hpp kernel_gemm_xdlops_v2r3_for_conv3d \endlink for \link
53 * DeviceConv3d \endlink uses the same concept, but currently does NOT encapsulate the computing of
57 * implementation we can avoid copy data to workspace before kernel launch since number of groups is
61 * \note \p Block2ETileMap allows customized mapping between a workgroup and the C-tile it computes.
62 * Together with \p ComputePtrOffsetOfBatch, we can reuse GridwiseGemm (and GridwiseGemm fusion ) to
609 static constexpr index_t ElementwiseBlocksize = ClusterLengthMPerBlock * ClusterLengthNPerBlock;
1898 << "TransposeTransferOutScalarPerVectorAligned: " << TransposeTransferOutScalarPerVectorAligned;
float launch_and_time_kernel(const StreamConfig &stream_config, F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
Definition host_utility/kernel_launch.hpp:14
float launch_and_time_kernel_with_preprocess(const StreamConfig &stream_config, PreProcessFunc preprocess, F kernel, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
Definition host_utility/kernel_launch.hpp:91
__host__ __device__ constexpr index_t gcd(index_t x, index_t y)
Definition utility/math.hpp:154
__host__ __device__ constexpr auto integer_divide_ceil(X x, Y y)
Definition utility/math.hpp:72
Definition convolution_backward_data_specialization.hpp:8
GemmSpecialization
Definition gemm_specialization.hpp:11
@ MNKPadding
Definition gemm_specialization.hpp:20
constexpr bool is_NGCDHW_NGKDHW()
Definition device_grouped_conv_utils.hpp:112
constexpr bool is_NGCHW_GKCYX_NGKHW()
Definition device_grouped_conv_utils.hpp:64
std::string getConvBackwardDataSpecializationString(const ConvolutionBackwardDataSpecialization &s)
Definition convolution_backward_data_specialization.hpp:17
constexpr bool is_NGCDHW_GKCZYX_NGKDHW()
Definition device_grouped_conv_utils.hpp:104
ConvolutionBackwardDataSpecialization
Definition convolution_backward_data_specialization.hpp:11
@ Filter1x1Stride1Pad0
Definition convolution_backward_data_specialization.hpp:13
constexpr bool is_NGCHW_NGKHW()
Definition device_grouped_conv_utils.hpp:72
Definition convolution_backward_data_specialization.hpp:7
CK_TILE_HOST float launch_kernel(const stream_config &s, Callables &&... callables)
Definition tile/host/kernel_launch.hpp:173
Definition ck.hpp:268
__global__ void kernel_batched_elementwise(const InGridDescTuple in_grid_desc_tuple, const OutGridDescTuple out_grid_desc_tuple, const InDataTypePointerTuple p_in_global_tuple, const OutDataTypePointerTuple p_out_global_tuple, const Block2TileMap block_2_tile_map, const ElementwiseOperation elementwise_op, const index_t batch_count, const std::array< index_t, NumInputs > input_batch_strides, const std::array< index_t, NumOutputs > output_batch_strides)
Definition gridwise_elementwise_2d.hpp:221
__host__ __device__ constexpr auto make_pass_through_transform(const LowLength &low_length)
Definition multi_index_transform_helper.hpp:12
typename tuple_element< I, TTuple >::type tuple_element_t
Definition utility/tuple.hpp:208
__host__ __device__ constexpr auto make_merge_transform(const LowLengths &low_lengths)
Definition multi_index_transform_helper.hpp:55
__device__ uint32_t amd_wave_read_first_lane(uint32_t value)
Definition amd_wave_read_first_lane.hpp:100
__host__ __device__ constexpr Y type_convert(X x)
Definition utility/type_convert.hpp:98
__host__ __device__ constexpr auto generate_tuple(F &&f, Number< N >)
Definition tuple_helper.hpp:21
auto accumulate_n(ForwardIterator first, Size count, T init, BinaryOperation op) -> decltype(std::accumulate(first, std::next(first, count), init, op))
Definition library/utility/numeric.hpp:11
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
__global__ void kernel_elementwise_batched_dual(const InAGridDescTuple in_grid_desc_tuple_a, const InBGridDescTuple in_grid_desc_tuple_b, const OutAGridDescTuple out_grid_desc_tuple_a, const OutBGridDescTuple out_grid_desc_tuple_b, const InADataTypePointerTuple p_in_global_tuple_a, const InBDataTypePointerTuple p_in_global_tuple_b, const OutADataTypePointerTuple p_out_global_tuple_a, const OutBDataTypePointerTuple p_out_global_tuple_b, const Block2TileMapA block_2_tile_map_a, const Block2TileMapB block_2_tile_map_b, const ElementwiseOperation elementwise_op, const index_t a_grid_size, const index_t batch_count_a, const index_t batch_count_b, const std::array< index_t, NumInputsA > input_batch_strides_a, const std::array< index_t, NumInputsB > input_batch_strides_b, const std::array< index_t, NumOutputsA > output_batch_strides_a, const std::array< index_t, NumOutputsB > output_batch_strides_b)
Definition gridwise_elementwise_2d.hpp:117
__host__ __device__ constexpr auto transform_tensor_descriptor(const OldTensorDescriptor &old_tensor_desc, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss)
Definition tensor_description/tensor_descriptor.hpp:319
constexpr LoopScheduler make_default_loop_scheduler()
Definition loop_scheduler.hpp:20
bool is_bf16_atomic_supported()
Definition host_utility/device_prop.hpp:108
Definition ck/stream_config.hpp:10
Definition block_to_ctile_map.hpp:261
Definition gridwise_elementwise_2d.hpp:278
Definition gridwise_gemm_multiple_d_xdl_cshuffle.hpp:78
decltype(MakeDsGridPointer()) DsGridPointer
Definition gridwise_gemm_multiple_d_xdl_cshuffle.hpp:411
Definition block_to_ctile_map.hpp:872
Definition utility/sequence.hpp:43
Definition utility/tuple.hpp:117
Definition functional2.hpp:33
Definition tensor_operation/gpu/device/tensor_layout.hpp:238
Definition tensor_operation/gpu/device/tensor_layout.hpp:243
Definition tensor_operation/gpu/device/tensor_layout.hpp:135
Definition tensor_operation/gpu/device/tensor_layout.hpp:362
Definition tensor_operation/gpu/device/tensor_layout.hpp:130
Definition tensor_operation/gpu/device/tensor_layout.hpp:357
Definition transform_conv_bwd_data_to_gemm_v1.hpp:44
__host__ __device__ auto MakeADescriptor_AK0_M_AK1() const
Definition transform_conv_bwd_data_to_gemm_v1.hpp:659
IndexType N_
Definition transform_conv_bwd_data_to_gemm_v1.hpp:1508
__host__ __device__ auto MakeBDescriptor_BK0_N_BK1() const
Definition transform_conv_bwd_data_to_gemm_v1.hpp:943
__host__ __device__ auto MakeCDescriptor_M_N() const
Definition transform_conv_bwd_data_to_gemm_v1.hpp:1150
Definition transform_conv_ngchw_to_nhwgc.hpp:31
Definition device_base.hpp:197
void * p_workspace_
Definition device_base.hpp:204
BaseArgument()=default
BaseInvoker()=default
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:666
std::size_t GetWorkspaceETensorSizeBytes() const
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1044
index_t conv_N_per_block_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1092
index_t num_workgroups_per_Conv_N_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1126
std::vector< DsGridDesc_M_N > ds_grid_desc_m_n_container_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1095
NGCHWTransposeDescType e_out_transpose_desc_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1103
const index_t k_batch_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1125
index_t gemms_count_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1128
std::array< index_t, NDimSpatial+3 > a_g_n_k_wos_lengths_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1118
ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1::Argument::p_e_grid_
EDataType * p_e_grid_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1088
Argument(const void *p_a, const void *p_b, const std::array< const void *, NumDTensor > &p_ds, void *p_e, const std::array< index_t, NDimSpatial+3 > &a_g_n_k_wos_lengths, const std::array< index_t, NDimSpatial+3 > &a_g_n_k_wos_strides, const std::array< index_t, NDimSpatial+3 > &b_g_k_c_xs_lengths, const std::array< index_t, NDimSpatial+3 > &b_g_k_c_xs_strides, const std::array< std::array< index_t, NDimSpatial+3 >, NumDTensor > &ds_g_n_c_wis_lengths, const std::array< std::array< index_t, NDimSpatial+3 >, NumDTensor > &ds_g_n_c_wis_strides, const std::array< index_t, NDimSpatial+3 > &e_g_n_c_wis_lengths, const std::array< index_t, NDimSpatial+3 > &e_g_n_c_wis_strides, const std::array< index_t, NDimSpatial > &conv_filter_strides, const std::array< index_t, NDimSpatial > &conv_filter_dilations, const std::array< index_t, NDimSpatial > &input_left_pads, const std::array< index_t, NDimSpatial > &input_right_pads, const AElementwiseOp &a_element_op, const BElementwiseOp &b_element_op, const CDEElementwiseOp &cde_element_op, ck::index_t split_k=1)
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:667
GKCYXTransposeDescType b_in_transpose_desc_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1105
long_index_t e_space_size_bytes
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1132
ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1::Argument::p_a_grid_
const ADataType * p_a_grid_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1085
std::array< index_t, NDimSpatial > conv_filter_strides_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1121
std::array< index_t, NDimSpatial+3 > e_g_n_c_wis_lengths_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1120
std::vector< std::array< GemmArgs, MaxGroupedGemmGroupsNum > > gemm_kernel_args_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1129
std::array< index_t, NDimSpatial > input_left_pads_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1122
GridwiseGemm64::DsGridPointer p_ds_grid_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1087
Block2TileMapInOutElementwise elementwise_block_2_ctile_map_transpose_a_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1099
std::vector< AGridDesc_M_K > a_grid_desc_m_k_container_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1093
ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1::Argument::p_b_grid_
const BDataType * p_b_grid_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1086
Block2TileMapInOutElementwise elementwise_block_2_ctile_map_transpose_e_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1100
CDEElementwiseOp cde_element_op_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1116
ComputePtrOffsetOfStridedBatch< I1, I1, NumDTensor > compute_ptr_offset_of_batch_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1109
bool bwd_needs_zero_out
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1131
NHWGCTransposeDescType e_in_transpose_desc_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1104
AElementwiseOp a_element_op_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1114
std::vector< BGridDesc_N_K > b_grid_desc_n_k_container_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1094
void Print() const
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1064
std::vector< index_t > gemms_grid_size_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1127
NHWGCTransposeDescType a_out_transpose_desc_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1104
index_t num_group_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1091
std::array< index_t, NDimSpatial+3 > b_g_k_c_xs_lengths_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1119
std::array< index_t, NDimSpatial > input_right_pads_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1123
ComputePtrOffsetOfStridedBatch< I1, I1, I0 > compute_ptr_offset_of_workspace_n_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1111
std::vector< EGridDesc_M_N > e_grid_desc_m_n_container_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1096
std::size_t GetWorkspaceSizeBytes() const
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1058
std::size_t GetWorkspaceBTensorSizeBytes() const
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1029
NGCHWTransposeDescType a_in_transpose_desc_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1103
GKYXCTransposeDescType b_out_transpose_desc_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1106
std::size_t GetWorkspaceATensorSizeBytes() const
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1014
BElementwiseOp b_element_op_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1115
ComputePtrOffsetOfStridedBatch< I1, I1, I0 > compute_ptr_offset_of_n_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1110
Block2TileMapWeiElementwise elementwise_block_2_ctile_map_transpose_b_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1101
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:534
index_t BlockStart_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:572
DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:567
ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1::GemmArgs::BlockEnd_
index_t BlockEnd_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:572
bool HasMainKBlockLoop_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:573
EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:568
AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:564
GroupedGemmBlock2ETileMap block_2_ctile_map_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:571
GemmArgs()=default
BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:565
GemmArgs(AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1, BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1, DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock ds_grid_desc_mblock_mperblock_nblock_nperblock, EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock e_grid_desc_mblock_mperblock_nblock_nperblock, GroupedGemmBlock2ETileMap block_2_ctile_map, index_t BlockStart, index_t BlockEnd, bool HasMainKBlockLoop)
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:536
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1137
float RunImp(const Argument &arg, const StreamConfig &stream_config=StreamConfig{})
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1302
float RunMultiDGemm(const Argument &arg, const StreamConfig &stream_config=StreamConfig{})
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1143
DeviceOp::Argument Argument
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1138
float Run(const BaseArgument *p_arg, const StreamConfig &stream_config=StreamConfig{}) override
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1478
float Run(const Argument &arg, const StreamConfig &stream_config=StreamConfig{})
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1452
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:306
static constexpr auto NXdlPerWave32
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:324
static constexpr index_t ElementwiseBlocksize
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:609
static constexpr index_t TransposeTransferOutScalarPerVectorAligned
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:593
static constexpr auto conv_ngchw_to_nhwgc_transformer
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:583
std::unique_ptr< BaseInvoker > MakeInvokerPointer() override
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1866
ADataType ABDataType
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:333
static auto MakeArgument(const void *p_a, const void *p_b, const std::array< const void *, NumDTensor > &p_ds, void *p_e, const std::array< index_t, NDimSpatial+3 > &a_g_n_k_wos_lengths, const std::array< index_t, NDimSpatial+3 > &a_g_n_k_wos_strides, const std::array< index_t, NDimSpatial+3 > &b_g_k_c_xs_lengths, const std::array< index_t, NDimSpatial+3 > &b_g_k_c_xs_strides, const std::array< std::array< index_t, NDimSpatial+3 >, NumDTensor > &ds_g_n_c_wis_lengths, const std::array< std::array< index_t, NDimSpatial+3 >, NumDTensor > &ds_g_n_c_wis_strides, const std::array< index_t, NDimSpatial+3 > &e_g_n_c_wis_lengths, const std::array< index_t, NDimSpatial+3 > &e_g_n_c_wis_strides, const std::array< index_t, NDimSpatial > &conv_filter_strides, const std::array< index_t, NDimSpatial > &conv_filter_dilations, const std::array< index_t, NDimSpatial > &input_left_pads, const std::array< index_t, NDimSpatial > &input_right_pads, const AElementwiseOp &a_element_op, const BElementwiseOp &b_element_op, const CDEElementwiseOp &cde_element_op, const ck::index_t split_k=1)
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1773
static constexpr index_t ClusterLengthNPerBlock
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:580
remove_cvref_t< decltype(conv_ngchw_to_nhwgc_transformer .template MakeGKYXCTransposeDesc< NDimSpatial >({}, {}))> GKYXCTransposeDescType
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:605
std::conditional_t< is_NGCHW_NGKHW< ELayout, BLayout, ALayout >() &&NeedTransposeKernel, tensor_layout::convolution::NHWGC, std::conditional_t< is_NGCDHW_NGKDHW< ELayout, BLayout, ALayout >() &&NeedTransposeKernel, tensor_layout::convolution::NDHWGC, ELayout > > ELayoutAfterTranspose
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:368
remove_cvref_t< tuple_element_t< 3, ABDsEGridDesc > > EGridDesc_M_N
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:516
size_t GetWorkSpaceSize(const BaseArgument *p_arg) const override
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1907
remove_cvref_t< tuple_element_t< 2, ABDsEGridDesc > > DsGridDesc_M_N
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:515
remove_cvref_t< decltype(conv_ngchw_to_nhwgc_transformer .template MakeNGCHWTransposeDesc< NDimSpatial >({}, {}))> NGCHWTransposeDescType
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:596
decltype(transform_k0_m_k1_to_m_k(AGridDesc_AK0_M_AK1{})) AGridDesc_M_K
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:518
static constexpr ConvToGemmBwdDataTransform dummy_conv_to_gemm_transform
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:510
static constexpr auto I3
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:338
ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1::IsSupportedArgument
bool IsSupportedArgument(const BaseArgument *p_arg) override
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1767
ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1::NeedTransposeKernel
static constexpr bool NeedTransposeKernel
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:347
GridwiseGemmMultipleD_xdl_cshuffle< GridwiseGemmCTransposeTemplateParameters > GridwiseGemmCTransposeBase
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:475
std::conditional_t< is_NGCHW_NGKHW< ELayout, BLayout, ALayout >() &&NeedTransposeKernel, tensor_layout::convolution::NHWGK, std::conditional_t< is_NGCDHW_NGKDHW< ELayout, BLayout, ALayout >() &&NeedTransposeKernel, tensor_layout::convolution::NDHWGK, ALayout > > ALayoutAfterTranspose
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:355
static constexpr auto I1
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:336
static constexpr auto I0
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:335
ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1::IsSupportedArgument
static bool IsSupportedArgument(const Argument &arg)
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1485
decltype(GridwiseGemmCTranspose64::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock( DsGridDesc_M_N{})) DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:521
GridwiseElementwise< Tuple< NHWGCTransposeDescType >, Tuple< NGCHWTransposeDescType >, Tuple< const EDataType * >, Tuple< EDataType * >, Block2TileMapInOutElementwise, element_wise::PassThrough, ElementwiseBlocksize, NPerBlock, MPerBlock, NPerBlock/ClusterLengthNPerBlock, MPerBlock/ClusterLengthMPerBlock, Sequence< 1, 0 >, Sequence< CDEBlockTransferScalarPerVector_NPerBlock >, Sequence< TransposeTransferOutScalarPerVectorAligned >, I0, I1 > GridwiseElementwiseOutputTranspose
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:647
decltype(GridwiseGemmCTranspose64::MakeDefaultBlock2ETileMap(EGridDesc_M_N{})) Block2ETileMap
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:528
static auto transform_k0_m_k1_to_m_k(const Desc_K0_M_K1 &desc_k0_m_k1)
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:496
GridwiseGemmBase< math::max(NXdlPerWave64, 1)> GridwiseGemm64
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:477
BlockToCTileMap_M00_N0_M01Adapt< MPerBlock, NPerBlock > Block2TileMapWeiElementwise
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:576
std::conditional_t< is_NGCHW_GKCYX_NGKHW< ELayout, BLayout, ALayout >() &&NeedTransposeKernel, tensor_layout::convolution::GKYXC, std::conditional_t< is_NGCDHW_GKCZYX_NGKDHW< ELayout, BLayout, ALayout >() && NeedTransposeKernel, tensor_layout::convolution::GKZYXC, BLayout > > BLayoutAfterTranspose
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:361
ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1::AGridDesc_AK0_M_AK1
remove_cvref_t< tuple_element_t< 0, ABDsEGridDesc > > AGridDesc_AK0_M_AK1
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:513
DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 DeviceOp
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:321
static auto GetDummyABDsEGridDescriptor(const ConvToGemmBwdDataTransform &conv_to_gemm_transform)
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:395
static constexpr index_t ClusterLengthMPerBlock
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:578
static auto MakeInvoker()
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1818
static GET_NXDL_PER_WAVE_IMPL constexpr auto NXdlPerWave64
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:323
static constexpr bool isATensorColMajor
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:340
ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1::MakeArgumentPointer
std::unique_ptr< BaseArgument > MakeArgumentPointer(const void *p_a, const void *p_b, const std::array< const void *, NumDTensor > &p_ds, void *p_e, const std::array< index_t, NDimSpatial+3 > &a_g_n_k_wos_lengths, const std::array< index_t, NDimSpatial+3 > &a_g_n_k_wos_strides, const std::array< index_t, NDimSpatial+3 > &b_g_k_c_xs_lengths, const std::array< index_t, NDimSpatial+3 > &b_g_k_c_xs_strides, const std::array< std::array< index_t, NDimSpatial+3 >, NumDTensor > &ds_g_n_c_wis_lengths, const std::array< std::array< index_t, NDimSpatial+3 >, NumDTensor > &ds_g_n_c_wis_strides, const std::array< index_t, NDimSpatial+3 > &e_g_n_c_wis_lengths, const std::array< index_t, NDimSpatial+3 > &e_g_n_c_wis_strides, const std::array< index_t, NDimSpatial > &conv_filter_strides, const std::array< index_t, NDimSpatial > &conv_filter_dilations, const std::array< index_t, NDimSpatial > &input_left_pads, const std::array< index_t, NDimSpatial > &input_right_pads, const AElementwiseOp &a_element_op, const BElementwiseOp &b_element_op, const CDEElementwiseOp &cde_element_op, const ck::index_t split_k=1) override
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1820
GridwiseGemmBase< NXdlPerWave32 > GridwiseGemm32
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:478
std::string GetTypeString() const override
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1871
TransformConvBwdDataToGemm_v1< NDimSpatial, ConvBackwardDataSpecialization, AK1, BK1, MPerBlock, NPerBlock, KPerBlock, DoPadGemmM, DoPadGemmN, ALayoutAfterTranspose, BLayoutAfterTranspose, ELayoutAfterTranspose, true, ABDataType, EDataType, 1, index_t, CTranspose > ConvToGemmBwdDataTransform
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:375
decltype(MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(EGridDesc_M_N{})) EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:524
ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1::SetWorkSpacePointer
void SetWorkSpacePointer(BaseArgument *p_arg, void *p_workspace, const StreamConfig &=StreamConfig{}) const override
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:1920
GridwiseElementwise< Tuple< NGCHWTransposeDescType >, Tuple< NHWGCTransposeDescType >, Tuple< const ADataType * >, Tuple< ADataType * >, Block2TileMapInOutElementwise, element_wise::PassThrough, ElementwiseBlocksize, NPerBlock, MPerBlock, NPerBlock/ClusterLengthNPerBlock, MPerBlock/ClusterLengthMPerBlock, Sequence< 1, 0 >, Sequence< TransposeTransferInScalarPerVectorAligned >, Sequence< CDEBlockTransferScalarPerVector_NPerBlock >, I1, I0 > GridwiseElementwiseInputTranspose
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:611
GridwiseGemmMultipleD_xdl_cshuffle< GridwiseGemmMultiDTemplateParams > GridwiseGemmBase
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:473
OffsettedBlockToCTileMap< Block2ETileMap > GroupedGemmBlock2ETileMap
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:531
GridwiseElementwise< Tuple< GKCYXTransposeDescType >, Tuple< GKYXCTransposeDescType >, Tuple< const BDataType * >, Tuple< BDataType * >, Block2TileMapWeiElementwise, element_wise::PassThrough, ElementwiseBlocksize, MPerBlock, NPerBlock, MPerBlock/ClusterLengthMPerBlock, NPerBlock/ClusterLengthNPerBlock, Sequence< 1, 0 >, Sequence< 1 >, Sequence< CDEBlockTransferScalarPerVector_NPerBlock >, I0, I1 > GridwiseElementwiseWeightTranspose
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:629
remove_cvref_t< decltype(conv_ngchw_to_nhwgc_transformer .template MakeNHWGCTransposeDesc< NDimSpatial >({}, {}))> NHWGCTransposeDescType
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:599
static auto MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(const EGridDesc_M_N e_grid_desc_m_n)
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:489
std::conditional_t< CTranspose, GridwiseGemmCTransposeBase< NXdlPerWave32 >, GridwiseGemm32 > GridwiseGemmCTranspose32
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:484
static constexpr index_t MaxGroupedGemmGroupsNum
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:315
static constexpr bool CTranspose
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:351
BlockToCTileMap_M00_N0_M01Adapt< NPerBlock, MPerBlock > Block2TileMapInOutElementwise
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:575
ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1::BGridDesc_BK0_N_BK1
remove_cvref_t< tuple_element_t< 1, ABDsEGridDesc > > BGridDesc_BK0_N_BK1
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:514
remove_cvref_t< decltype(conv_ngchw_to_nhwgc_transformer .template MakeGKCYXTransposeDesc< NDimSpatial >({}, {}))> GKCYXTransposeDescType
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:602
static constexpr index_t NumDTensor
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:326
std::conditional_t< CTranspose, GridwiseGemmCTransposeBase< math::max(NXdlPerWave64, 1)>, GridwiseGemm64 > GridwiseGemmCTranspose64
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:480
static constexpr auto I2
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:337
static constexpr index_t TransposeTransferInScalarPerVectorAligned
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:591
static constexpr bool IsSplitKSupported
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:328
decltype(GetDummyABDsEGridDescriptor(dummy_conv_to_gemm_transform)) ABDsEGridDesc
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:511
decltype(transform_k0_m_k1_to_m_k(BGridDesc_BK0_N_BK1{})) BGridDesc_N_K
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:519
static constexpr GemmSpecialization GemmSpec
Definition device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp:327
Definition device_grouped_conv_bwd_data_multiple_d.hpp:36
Definition tensor_operation/gpu/element/unary_element_wise_operation.hpp:340