threadwise_tensor_slice_transfer_v5r1.hpp Source File

threadwise_tensor_slice_transfer_v5r1.hpp Source File#

Composable Kernel: threadwise_tensor_slice_transfer_v5r1.hpp Source File
threadwise_tensor_slice_transfer_v5r1.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: MIT
2// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
3
4#pragma once
5
10
11namespace ck {
12
13// Assume:
14// 1. src_desc and dst_desc are not known at compile-time
15// 2. SrcBuffer and DstBuffer are DynamicBuffer
16// 3. src_slice_origin and dst_slice_origin are not known at compile-time,
17// 4. Use thread buffer
18template <typename SliceLengths,
20 typename SrcData,
21 typename DstData,
22 typename SrcDesc,
23 typename DstDesc,
24 typename SrcDimAccessOrder,
25 typename DstDimAccessOrder,
26 typename SrcVectorTensorLengths,
27 typename DstVectorTensorLengths,
28 typename SrcVectorTensorContiguousDimOrder,
29 typename DstVectorTensorContiguousDimOrder,
30 bool SrcResetCoordinateAfterRun, // control whether to move back src coordinate after each
31 // RunRead(), will be fused with MoveSrcSliceWindow to
32 // save addr computation
33 bool DstResetCoordinateAfterRun> // control whether to move back dst coordinate after each
34 // RunWrite(), will be fused with MoveDstSliceWindow to
35 // save addr computation
37{
38 static constexpr auto I0 = Number<0>{};
39 static constexpr auto I1 = Number<1>{};
40
41 static constexpr index_t nDim = SliceLengths::Size();
43
44 using SrcCoord = decltype(make_tensor_coordinate(SrcDesc{}, Index{}));
45 using DstCoord = decltype(make_tensor_coordinate(DstDesc{}, Index{}));
46
47 using SrcCoordStep = decltype(make_tensor_coordinate_step(SrcDesc{}, Index{}));
48 using DstCoordStep = decltype(make_tensor_coordinate_step(DstDesc{}, Index{}));
49
50 __device__ constexpr ThreadwiseTensorSliceTransfer_v5r1(const SrcDesc& src_desc,
51 const Index& src_slice_origin,
52 const DstDesc& dst_desc,
53 const Index& dst_slice_origin)
54 : src_coord_(make_tensor_coordinate(src_desc, src_slice_origin)),
55 dst_coord_(make_tensor_coordinate(dst_desc, dst_slice_origin))
56 {
57 // TODO: fix this
59 "wrong! current implementation assume SrcData and DstData are same type");
60
61 static_for<0, nDim, 1>{}([](auto i) {
62 static_assert(SliceLengths::At(i) % SrcVectorTensorLengths::At(i) == 0 &&
63 SliceLengths::At(i) % DstVectorTensorLengths::At(i) == 0,
64 "wrong!");
65 });
66 }
67
68 __device__ void SetSrcSliceOrigin(const SrcDesc& src_desc, const Index& src_slice_origin_idx)
69 {
70 src_coord_ = make_tensor_coordinate(src_desc, src_slice_origin_idx);
71 }
72
73 __device__ void SetDstSliceOrigin(const DstDesc& dst_desc, const Index& dst_slice_origin_idx)
74 {
75 dst_coord_ = make_tensor_coordinate(dst_desc, dst_slice_origin_idx);
76 }
77
78 template <typename SrcBuffer, typename SrcStepHacks>
79 __device__ void
80 RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf, const SrcStepHacks& src_step_hacks)
81 {
82 static_assert(SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
83 SrcBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
84 "wrong!");
85
86 static_assert(
88 "wrong! SrcBuffer and SrcData data type are inconsistent");
89
90 // tensor descriptor for src_vector
91 constexpr auto src_vector_tensor_lengths = SrcVectorTensorLengths{};
92
93 constexpr auto src_vector_tensor_strides = container_reorder_given_old2new(
95 container_reorder_given_new2old(src_vector_tensor_lengths,
96 SrcVectorTensorContiguousDimOrder{}),
98 I1),
99 SrcVectorTensorContiguousDimOrder{});
100
101 constexpr auto src_vector_desc =
103 sequence_to_tuple_of_number(src_vector_tensor_strides));
104
105 // access order and lengths
106 constexpr auto src_access_lengths = SliceLengths{} / src_vector_tensor_lengths;
107
108 constexpr auto src_dim_access_order = SrcDimAccessOrder{};
109
110 constexpr auto ordered_src_access_lengths =
111 container_reorder_given_new2old(src_access_lengths, src_dim_access_order);
112
113 // make forward steps
114 const auto src_forward_steps = generate_tuple(
115 [&](auto i) {
116 Index forward_step_idx;
117
118 static_for<0, nDim, 1>{}([&](auto j) {
119 forward_step_idx(j) = (i.value == j.value) ? src_vector_tensor_lengths[i] : 0;
120 });
121
123 src_desc, forward_step_idx, src_step_hacks[I0][i]);
124 },
125 Number<nDim>{});
126
127 // make backward steps
128 const auto src_backward_steps = generate_tuple(
129 [&](auto i) {
130 Index backward_step_idx;
131
132 static_for<0, nDim, 1>{}([&](auto j) {
133 backward_step_idx(j) = (i.value == j.value) ? -src_vector_tensor_lengths[i] : 0;
134 });
135
137 src_desc, backward_step_idx, src_step_hacks[I1][i]);
138 },
139 Number<nDim>{});
140
141 // loop over tensor and copy
142 static_ford<decltype(ordered_src_access_lengths)>{}([&](auto ordered_src_access_idx) {
143 // judge move forward or move backward
144 constexpr auto forward_sweep = [&]() {
146
147 forward_sweep_(I0) = true;
148
149 static_for<1, nDim, 1>{}([&](auto i) {
150 index_t tmp = ordered_src_access_idx[I0];
151
152 static_for<0, i, 1>{}([&](auto j) {
153 tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_idx[j];
154 });
155
156 forward_sweep_(i) = tmp % 2 == 0;
157 });
158
159 return forward_sweep_;
160 }();
161
162 // calculate src data index
163 constexpr auto src_data_idx = [&]() {
164 Index ordered_idx;
165
166 static_for<0, nDim, 1>{}([&](auto i) {
167 ordered_idx(i) = forward_sweep[i] ? ordered_src_access_idx[i]
168 : ordered_src_access_lengths[i] - 1 -
169 ordered_src_access_idx[i];
170 });
171
172 return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
173 src_vector_tensor_lengths;
174 }();
175
176 vector_type_maker_t<SrcData, src_vector_desc.GetElementSpaceSize()> src_vector;
177
178 using src_vector_t = typename decltype(src_vector)::type;
179
180 const bool is_src_valid =
182
183 // copy data from src_buf to src_vector
184 src_vector.template AsType<src_vector_t>()(I0) =
185 src_buf.template Get<src_vector_t>(src_coord_.GetOffset(), is_src_valid);
186
187 // copy data from src_vector to buffer_
188 static_ford<SrcVectorTensorLengths>{}([&](auto src_vector_idx_) {
189 constexpr auto src_vector_idx = to_multi_index(src_vector_idx_);
190
191 constexpr index_t src_vector_offset =
192 src_vector_desc.CalculateOffset(src_vector_idx);
193
194 constexpr index_t buffer_offset =
195 buffer_desc_.CalculateOffset(src_data_idx + src_vector_idx);
196
197 buffer_(Number<buffer_offset>{}) =
198 src_vector.template AsType<SrcData>()[Number<src_vector_offset>{}];
199 });
200
201 constexpr auto move_on_dim = [&]() constexpr {
203
204 static_for<0, nDim, 1>{}([&](auto i) {
205 move_on_dim_(i) = ordered_src_access_idx[i] < ordered_src_access_lengths[i] - 1;
206
207 static_for<i + 1, nDim, 1>{}([&](auto j) {
208 move_on_dim_(i) &=
209 ordered_src_access_idx[j] == ordered_src_access_lengths[j] - 1;
210 });
211 });
212
213 return move_on_dim_;
214 }();
215
216 // move
217 static_for<0, nDim, 1>{}([&](auto i) {
218 if constexpr(move_on_dim[i])
219 {
220 if constexpr(forward_sweep[i])
221 {
223 src_desc, src_coord_, src_forward_steps[src_dim_access_order[i]]);
224 }
225 else
226 {
228 src_desc, src_coord_, src_backward_steps[src_dim_access_order[i]]);
229 }
230 }
231 });
232 });
233
234 // move src coordinate back to slice origin (or not)
235 if constexpr(SrcResetCoordinateAfterRun)
236 {
237 const auto src_reset_step =
239
240 move_tensor_coordinate(src_desc, src_coord_, src_reset_step);
241 }
242 }
243
244 template <typename DstBuffer, typename DstStepHacks>
245 __device__ void
246 RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf, const DstStepHacks& dst_step_hacks)
247 {
248 static_assert(DstBuffer::GetAddressSpace() == AddressSpaceEnum::Global or
249 DstBuffer::GetAddressSpace() == AddressSpaceEnum::Lds,
250 "wrong!");
251
252 static_assert(
254 "wrong! SrcBuffer or DstBuffer data type is wrong");
255
256 // tensor descriptor for dst_vector
257 constexpr auto dst_vector_tensor_lengths = DstVectorTensorLengths{};
258
259 constexpr auto dst_vector_tensor_strides = container_reorder_given_old2new(
261 container_reorder_given_new2old(dst_vector_tensor_lengths,
262 DstVectorTensorContiguousDimOrder{}),
264 I1),
265 DstVectorTensorContiguousDimOrder{});
266
267 constexpr auto dst_vector_desc =
269 sequence_to_tuple_of_number(dst_vector_tensor_strides));
270
271 // dst access order and lengths
272 constexpr auto dst_access_lengths = SliceLengths{} / dst_vector_tensor_lengths;
273
274 constexpr auto dst_dim_access_order = DstDimAccessOrder{};
275
276 constexpr auto ordered_dst_access_lengths =
277 container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order);
278
279 // make forward steps
280 const auto dst_forward_steps = generate_tuple(
281 [&](auto i) {
282 Index forward_step_idx;
283
284 static_for<0, nDim, 1>{}([&](auto j) {
285 forward_step_idx(j) = (i.value == j.value) ? dst_vector_tensor_lengths[i] : 0;
286 });
287
289 dst_desc, forward_step_idx, dst_step_hacks[I0][i]);
290 },
291 Number<nDim>{});
292
293 // make backward steps
294 const auto dst_backward_steps = generate_tuple(
295 [&](auto i) {
296 Index backward_step_idx;
297
298 static_for<0, nDim, 1>{}([&](auto j) {
299 backward_step_idx(j) = (i.value == j.value) ? -dst_vector_tensor_lengths[i] : 0;
300 });
301
303 dst_desc, backward_step_idx, dst_step_hacks[I1][i]);
304 },
305 Number<nDim>{});
306
307 // loop over tensor and copy
308 static_ford<decltype(ordered_dst_access_lengths)>{}([&](auto ordered_dst_access_idx) {
309 // judge move forward or move backward
310 constexpr auto forward_sweep = [&]() {
312
313 forward_sweep_(I0) = true;
314
315 static_for<1, nDim, 1>{}([&](auto i) {
316 index_t tmp = 0;
317
318 static_for<0, i, 1>{}([&](auto j) {
319 tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_idx[j];
320 });
321
322 forward_sweep_(i) = tmp % 2 == 0;
323 });
324
325 return forward_sweep_;
326 }();
327
328 // calculate dst data index
329 constexpr auto dst_data_idx = [&]() {
330 Index ordered_idx;
331
332 static_for<0, nDim, 1>{}([&](auto i) {
333 ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_idx[i]
334 : ordered_dst_access_lengths[i] - 1 -
335 ordered_dst_access_idx[i];
336 });
337
338 return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) *
339 dst_vector_tensor_lengths;
340 }();
341
342 vector_type_maker_t<DstData, dst_vector_desc.GetElementSpaceSize()> dst_vector;
343
344 // copy data from buffer_ to dst_vector (also cast from SrcData to DstData)
345 static_ford<DstVectorTensorLengths>{}([&](auto dst_vector_idx_) {
346 constexpr auto dst_vector_idx = to_multi_index(dst_vector_idx_);
347
348 constexpr index_t buffer_offset =
349 buffer_desc_.CalculateOffset(dst_data_idx + dst_vector_idx);
350
351 constexpr index_t dst_vector_offset =
352 dst_vector_desc.CalculateOffset(dst_vector_idx);
353
354 dst_vector.template AsType<DstData>()(Number<dst_vector_offset>{}) =
356 });
357
358 using dst_vector_t = typename decltype(dst_vector)::type;
359
360 // copy data from dst_vector to dst_buf
361 const bool is_dst_valid =
363
364 dst_buf.template Set<dst_vector_t>(
365 dst_coord_.GetOffset(),
366 is_dst_valid,
367 dst_vector.template AsType<dst_vector_t>()[Number<0>{}]);
368
369 constexpr auto move_on_dim = [&]() constexpr {
371
372 static_for<0, nDim, 1>{}([&](auto i) {
373 move_on_dim_(i) = ordered_dst_access_idx[i] < ordered_dst_access_lengths[i] - 1;
374
375 static_for<i + 1, nDim, 1>{}([&](auto j) {
376 move_on_dim_(i) &=
377 ordered_dst_access_idx[j] == ordered_dst_access_lengths[j] - 1;
378 });
379 });
380
381 return move_on_dim_;
382 }();
383
384 // move
385 static_for<0, nDim, 1>{}([&](auto i) {
386 if constexpr(move_on_dim[i])
387 {
388 if constexpr(forward_sweep[i])
389 {
391 dst_desc, dst_coord_, dst_forward_steps[dst_dim_access_order[i]]);
392 }
393 else
394 {
396 dst_desc, dst_coord_, dst_backward_steps[dst_dim_access_order[i]]);
397 }
398 }
399 });
400 });
401
402 // move dst coordinate back to slice origin (or not)
403 if constexpr(DstResetCoordinateAfterRun)
404 {
405 const auto dst_reset_step =
407
408 move_tensor_coordinate(dst_desc, dst_coord_, dst_reset_step);
409 }
410 }
411
412 template <typename SrcBuffer>
413 __device__ void RunRead(const SrcDesc& src_desc, const SrcBuffer& src_buf)
414 {
415 constexpr index_t ntransform_src = SrcDesc::GetNumOfTransform();
416
417 constexpr auto zeros = typename uniform_sequence_gen<ntransform_src, 0>::type{};
418
419 constexpr auto src_step_hacks =
420 make_tuple(generate_tuple([&](auto) { return zeros; }, Number<nDim>{}),
421 generate_tuple([&](auto) { return zeros; }, Number<nDim>{}));
422
423 RunRead(src_desc, src_buf, src_step_hacks);
424 }
425
426 template <typename DstBuffer>
427 __device__ void RunWrite(const DstDesc& dst_desc, DstBuffer& dst_buf)
428 {
429 constexpr index_t ntransform_dst = DstDesc::GetNumOfTransform();
430
431 constexpr auto zeros = typename uniform_sequence_gen<ntransform_dst, 0>::type{};
432
433 constexpr auto dst_step_hacks =
434 make_tuple(generate_tuple([&](auto) { return zeros; }, Number<nDim>{}),
435 generate_tuple([&](auto) { return zeros; }, Number<nDim>{}));
436
437 RunWrite(dst_desc, dst_buf, dst_step_hacks);
438 }
439
440 __device__ static constexpr auto GetSrcCoordinateResetStep()
441 {
442 constexpr auto src_vector_tensor_lengths = SrcVectorTensorLengths{};
443
444 constexpr auto src_access_lengths = SliceLengths{} / src_vector_tensor_lengths;
445
446 constexpr auto src_dim_access_order = SrcDimAccessOrder{};
447
448 constexpr auto ordered_src_access_lengths =
449 container_reorder_given_new2old(src_access_lengths, src_dim_access_order);
450
451 // judge move forward or move backward during the last iteration
452 constexpr auto forward_sweep = [&]() {
454
455 forward_sweep_(I0) = true;
456
457 static_for<1, nDim, 1>{}([&](auto i) {
458 index_t tmp = ordered_src_access_lengths[I0] - 1;
459
460 static_for<0, i, 1>{}([&](auto j) {
461 tmp = tmp * ordered_src_access_lengths[j] + ordered_src_access_lengths[j] - 1;
462 });
463
464 forward_sweep_(i) = tmp % 2 == 0;
465 });
466
467 return forward_sweep_;
468 }();
469
470 // calculate src data index after last iteration in RunRead(), if it has not being reset by
471 // RunRead()
472 constexpr auto src_data_idx = [&]() {
473 Index ordered_idx;
474
475 static_for<0, nDim, 1>{}([&](auto i) {
476 ordered_idx(i) = forward_sweep[i] ? ordered_src_access_lengths[i] - 1 : 0;
477 });
478
479 return container_reorder_given_old2new(ordered_idx, src_dim_access_order) *
480 src_vector_tensor_lengths;
481 }();
482
483 //
484 constexpr auto reset_src_data_step = [&]() {
485 Index reset_src_data_step_;
486
487 static_for<0, nDim, 1>{}([&](auto i) { reset_src_data_step_(i) = -src_data_idx[i]; });
488
489 return reset_src_data_step_;
490 }();
491
492 return reset_src_data_step;
493 }
494
495 __device__ static constexpr auto GetDstCoordinateResetStep()
496 {
497 constexpr auto dst_vector_tensor_lengths = DstVectorTensorLengths{};
498
499 constexpr auto dst_access_lengths = SliceLengths{} / dst_vector_tensor_lengths;
500
501 constexpr auto dst_dim_access_order = DstDimAccessOrder{};
502
503 constexpr auto ordered_dst_access_lengths =
504 container_reorder_given_new2old(dst_access_lengths, dst_dim_access_order);
505
506 // judge move forward or move backward during the last iteration
507 constexpr auto forward_sweep = [&]() {
509
510 forward_sweep_(I0) = true;
511
512 static_for<1, nDim, 1>{}([&](auto i) {
513 index_t tmp = ordered_dst_access_lengths[I0] - 1;
514
515 static_for<0, i, 1>{}([&](auto j) {
516 tmp = tmp * ordered_dst_access_lengths[j] + ordered_dst_access_lengths[j] - 1;
517 });
518
519 forward_sweep_(i) = tmp % 2 == 0;
520 });
521
522 return forward_sweep_;
523 }();
524
525 // calculate dst data index after last iteration in RunWrite(), if it has not being reset by
526 // RunWrite()
527 constexpr auto dst_data_idx = [&]() {
528 Index ordered_idx;
529
530 static_for<0, nDim, 1>{}([&](auto i) {
531 ordered_idx(i) = forward_sweep[i] ? ordered_dst_access_lengths[i] - 1 : 0;
532 });
533
534 return container_reorder_given_old2new(ordered_idx, dst_dim_access_order) *
535 dst_vector_tensor_lengths;
536 }();
537
538 //
539 constexpr auto reset_dst_data_step = [&]() {
540 Index reset_dst_data_step_;
541
542 static_for<0, nDim, 1>{}([&](auto i) { reset_dst_data_step_(i) = -dst_data_idx[i]; });
543
544 return reset_dst_data_step_;
545 }();
546
547 return reset_dst_data_step;
548 }
549
550 // src_slice_origin_step_idx need to be known at compile-time, for performance reason
551 __device__ void MoveSrcSliceWindow(const SrcDesc& src_desc,
552 const Index& src_slice_origin_step_idx)
553 {
554 // if src coord was not reset by RunRead(), then need to adjust the step here
555 const auto adjusted_step_idx =
556 SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
557 : src_slice_origin_step_idx + GetSrcCoordinateResetStep();
558
559 // is it OK to construct a new step every time?
560 const auto adjusted_step = make_tensor_coordinate_step(src_desc, adjusted_step_idx);
561
562 move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
563 }
564
565 // src_slice_origin_step_idx need to be known at compile-time, for performance reason
566 template <typename SrcMoveSliceWindowStepHack>
567 __device__ void
568 MoveSrcSliceWindow(const SrcDesc& src_desc,
569 const Index& src_slice_origin_step_idx,
570 const SrcMoveSliceWindowStepHack& src_move_slice_window_step_hack)
571 {
572 // if src coord was not reset by RunRead(), then need to adjust the step here
573 const auto adjusted_step_idx =
574 SrcResetCoordinateAfterRun ? src_slice_origin_step_idx
575 : src_slice_origin_step_idx + GetSrcCoordinateResetStep();
576
577 // is it OK to construct a new step every time?
578 const auto adjusted_step = make_tensor_coordinate_step(
579 src_desc, adjusted_step_idx, src_move_slice_window_step_hack);
580
581 move_tensor_coordinate(src_desc, src_coord_, adjusted_step);
582 }
583 // dst_slice_origin_step_idx need to be known at compile-time, for performance reason
584 __device__ void MoveDstSliceWindow(const DstDesc& dst_desc,
585 const Index& dst_slice_origin_step_idx)
586 {
587 // if dst coord was not reset by RunWrite(), then need to adjust the step here
588 const auto adjusted_step_idx =
589 DstResetCoordinateAfterRun ? dst_slice_origin_step_idx
590 : dst_slice_origin_step_idx + GetDstCoordinateResetStep();
591
592 // is it OK to construct a new step every time?
593 const auto adjusted_step = make_tensor_coordinate_step(dst_desc, adjusted_step_idx);
594
595 move_tensor_coordinate(dst_desc, dst_coord_, adjusted_step);
596 }
597
598 private:
599 static constexpr auto buffer_desc_ =
601
602 static constexpr auto buffer_size_ = buffer_desc_.GetElementSpaceSize();
603
604 StaticBuffer<AddressSpaceEnum::Vgpr, SrcData, buffer_size_, true> buffer_;
605
606 SrcCoord src_coord_;
607 DstCoord dst_coord_;
608};
609
610} // namespace ck
Definition ck.hpp:268
typename detail::StaticallyIndexedArrayImpl< T, N >::type StaticallyIndexedArray
Definition utility/statically_indexed_array.hpp:45
int32_t index_t
Definition ck.hpp:299
__host__ __device__ constexpr auto make_tensor_coordinate_step(const TensorDesc &, const VisibleIndex &idx_diff_visible, UpdateLowerIndexHack)
Definition tensor_description/tensor_descriptor.hpp:444
__host__ __device__ constexpr void move_tensor_coordinate(const TensorDesc &tensor_desc, TensorCoord &coord, const TensorCoordStep &coord_step)
Definition tensor_description/tensor_descriptor.hpp:508
__host__ __device__ constexpr auto make_naive_tensor_descriptor(const Tuple< Lengths... > &lengths, const Tuple< Strides... > &strides)
Definition tensor_descriptor_helper.hpp:49
InMemoryDataOperationEnum
Definition ck.hpp:277
@ Set
Definition ck.hpp:278
remove_cv_t< remove_reference_t< T > > remove_cvref_t
Definition type.hpp:297
__host__ __device__ constexpr bool coordinate_has_valid_offset_assuming_visible_index_is_valid(const TensorDesc &tensor_desc, const TensorCoord &coord)
Definition tensor_description/tensor_descriptor.hpp:560
integral_constant< index_t, N > Number
Definition number.hpp:12
@ Lds
Definition amd_address_space.hpp:18
@ Global
Definition amd_address_space.hpp:17
__host__ __device__ constexpr auto sequence_to_tuple_of_number(Sequence< Is... >)
Definition utility/container_helper.hpp:380
__host__ __device__ constexpr Y type_convert(X x)
Definition utility/type_convert.hpp:98
__host__ __device__ constexpr auto container_reorder_given_old2new(const Array< TData, NSize > &old_array, Sequence< IRs... > old2new)
Definition utility/container_helper.hpp:54
__host__ __device__ constexpr auto to_multi_index(const T &x)
Definition array_multi_index.hpp:28
__host__ __device__ constexpr auto generate_tuple(F &&f, Number< N >)
Definition tuple_helper.hpp:21
__host__ __device__ constexpr auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition tensor_descriptor_helper.hpp:101
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
__host__ __device__ constexpr auto container_reverse_exclusive_scan(const Array< TData, NSize > &x, Reduce f, TData init)
Definition utility/container_helper.hpp:213
__host__ __device__ constexpr auto make_tensor_coordinate(const TensorDesc &tensor_desc, const VisibleIndex &idx_visible)
Definition tensor_description/tensor_descriptor.hpp:407
__host__ __device__ constexpr auto container_reorder_given_new2old(const Array< TData, NSize > &old_array, Sequence< IRs... >)
Definition utility/container_helper.hpp:43
Array< index_t, N > MultiIndex
Definition array_multi_index.hpp:12
typename vector_type_maker< T, N >::type vector_type_maker_t
Definition dtype_vector.hpp:54
const GenericPointer< typename T::ValueType > T2 value
Definition pointer.h:1697
__device__ void RunRead(const SrcDesc &src_desc, const SrcBuffer &src_buf)
Definition threadwise_tensor_slice_transfer_v5r1.hpp:413
__device__ void MoveDstSliceWindow(const DstDesc &dst_desc, const Index &dst_slice_origin_step_idx)
Definition threadwise_tensor_slice_transfer_v5r1.hpp:584
__device__ void SetDstSliceOrigin(const DstDesc &dst_desc, const Index &dst_slice_origin_idx)
Definition threadwise_tensor_slice_transfer_v5r1.hpp:73
__device__ void MoveSrcSliceWindow(const SrcDesc &src_desc, const Index &src_slice_origin_step_idx)
Definition threadwise_tensor_slice_transfer_v5r1.hpp:551
__device__ void SetSrcSliceOrigin(const SrcDesc &src_desc, const Index &src_slice_origin_idx)
Definition threadwise_tensor_slice_transfer_v5r1.hpp:68
__device__ void RunWrite(const DstDesc &dst_desc, DstBuffer &dst_buf, const DstStepHacks &dst_step_hacks)
Definition threadwise_tensor_slice_transfer_v5r1.hpp:246
__device__ void MoveSrcSliceWindow(const SrcDesc &src_desc, const Index &src_slice_origin_step_idx, const SrcMoveSliceWindowStepHack &src_move_slice_window_step_hack)
Definition threadwise_tensor_slice_transfer_v5r1.hpp:568
static __device__ constexpr auto GetSrcCoordinateResetStep()
Definition threadwise_tensor_slice_transfer_v5r1.hpp:440
__device__ constexpr ThreadwiseTensorSliceTransfer_v5r1(const SrcDesc &src_desc, const Index &src_slice_origin, const DstDesc &dst_desc, const Index &dst_slice_origin)
Definition threadwise_tensor_slice_transfer_v5r1.hpp:50
__device__ void RunWrite(const DstDesc &dst_desc, DstBuffer &dst_buf)
Definition threadwise_tensor_slice_transfer_v5r1.hpp:427
static __device__ constexpr auto GetDstCoordinateResetStep()
Definition threadwise_tensor_slice_transfer_v5r1.hpp:495
__device__ void RunRead(const SrcDesc &src_desc, const SrcBuffer &src_buf, const SrcStepHacks &src_step_hacks)
Definition threadwise_tensor_slice_transfer_v5r1.hpp:80
static constexpr value_type value
Definition utility/integral_constant.hpp:13
Definition type.hpp:177
Definition utility/math.hpp:34
Definition functional2.hpp:33
Definition functional3.hpp:97
typename sequence_gen< NSize, F >::type type
Definition utility/sequence.hpp:295