amd_buffer_addressing_builtins.hpp Source File

amd_buffer_addressing_builtins.hpp Source File#

Composable Kernel: amd_buffer_addressing_builtins.hpp Source File
tile/core/arch/amd_buffer_addressing_builtins.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: MIT
2// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
3
4#pragma once
5
7
8#if CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN
9
19
20using as3_uint32_ptr = uint32_t __attribute__((address_space(3)))*;
21
22namespace ck_tile {
23
24// amd_wave_read_first_lane is the SGPR function from AMD GPU device to load 1 or a series of the
25// memory to the SGPR registers.
27{
28 return __builtin_amdgcn_readfirstlane(static_cast<uint32_t>(v));
29}
30
31__device__ inline uint32_t amd_wave_read_first_lane(uint8_t v)
32{
33 return __builtin_amdgcn_readfirstlane(static_cast<uint32_t>(v));
34}
35
37{
38 return __builtin_amdgcn_readfirstlane(value);
39}
40
42{
43 return __builtin_amdgcn_readfirstlane(value);
44}
45
46template <typename Object, std::enable_if_t<std::is_trivially_copyable_v<Object>, int> = 0>
47__device__ inline auto amd_wave_read_first_lane(const Object& obj)
48{
49 constexpr size_t ObjectSize = sizeof(Object);
50 constexpr size_t SGPR_size = 4;
51 constexpr size_t NumFull = ObjectSize / SGPR_size;
52 constexpr size_t Tail = ObjectSize % SGPR_size;
53
54 const unsigned char* src = reinterpret_cast<const unsigned char*>(&obj);
55 alignas(Object) unsigned char dst[ObjectSize];
56
57 static_for<0, NumFull, 1>{}([&](auto Ic) {
58 constexpr size_t offset = Ic * SGPR_size;
59 uint32_t read_src;
60 __builtin_memcpy(&read_src, src + offset, SGPR_size);
61 read_src = __builtin_amdgcn_readfirstlane(read_src);
62 __builtin_memcpy(dst + offset, &read_src, SGPR_size);
63 });
64
65 if constexpr(Tail != 0)
66 {
67 constexpr size_t offset = NumFull * SGPR_size;
68 uint32_t tail_loc = 0;
69 __builtin_memcpy(&tail_loc, src + offset, Tail);
70 tail_loc = __builtin_amdgcn_readfirstlane(tail_loc);
71 __builtin_memcpy(dst + offset, &tail_loc, Tail);
72 }
73 Object out;
74 __builtin_memcpy(&out, dst, ObjectSize);
75 return out;
76}
77
78// 128 bit SGPRs to supply buffer resource in buffer instructions
79// https://rocm-documentation.readthedocs.io/en/latest/GCN_ISA_Manuals/testdocbook.html#vector-memory-buffer-instructions
80struct __attribute__((packed)) buffer_resource
81{
82 const void* ptr;
83 uint32_t range;
84 uint32_t config;
85};
86
87template <typename ForceSGPR = std::false_type>
89 uint32_t size = 0xffffffff,
90 ForceSGPR = {})
91{
93 int32x4_t r = __builtin_bit_cast(int32x4_t, res);
94 if constexpr(std::is_same_v<ForceSGPR, std::true_type>)
95 {
97 }
98 return r;
99}
100
101namespace impl {
102// below type indicate the data type used for buffer load inline asm
103// clang-format off
104template<index_t N, typename T> struct buffer_load_trait;
105
106template<typename T> struct buffer_load_trait<16, T> { using payload_t = fp32x4_t; };
107template<typename T> struct buffer_load_trait<8 , T> { using payload_t = fp32x2_t; };
108template<typename T> struct buffer_load_trait<4 , T> { using payload_t = float; };
109template<typename T> struct buffer_load_trait<2 , T> { using payload_t = float; };
110template<typename T> struct buffer_load_trait<1 , T> { using payload_t = float; };
111
112#if CK_TILE_BUFFER_LOAD_RAW_BF16_WA
113template<> struct buffer_load_trait<16, thread_buffer<bf16_t, 8>> { using payload_t = bf16x8_t; };
114template<> struct buffer_load_trait<8 , thread_buffer<bf16_t, 4>> { using payload_t = bf16x4_t; };
115template<> struct buffer_load_trait<4 , thread_buffer<bf16_t, 2>> { using payload_t = bf16x2_t; };
116#endif
117// clang-format on
118} // namespace impl
119
120// TODO: glc/slc/...
121template <index_t bytes, bool pre_nop = false>
122struct buffer_load;
123#pragma clang diagnostic push
124#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
125// TODO: strict aliasing rule seems fail when reinterpret_cast between vector type
126// (exp_vector_type(xxx))
127template <bool pre_nop>
128struct buffer_load<16, pre_nop>
129{
130 template <typename T>
131 CK_TILE_DEVICE void operator()(T& value,
132 int32x4_t res /*buffer resource*/,
133 index_t v_offset,
134 index_t /*s_offset*/,
135 index_t i_offset /*max 0xFFF*/,
136 index_t /*flag*/ = 0,
138 {
139 static_assert(sizeof(T) == 16);
140 using mbuf_t = typename impl::buffer_load_trait<16, T>::payload_t;
141 if constexpr(pre_nop)
142 asm volatile("s_nop 4\n"
143 "buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3"
144 : "+v"(reinterpret_cast<mbuf_t&>(value))
145 : "v"(v_offset), "s"(res), "n"(i_offset)
146 : "memory");
147 else
148 asm volatile("buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3"
149 : "+v"(reinterpret_cast<mbuf_t&>(value))
150 : "v"(v_offset), "s"(res), "n"(i_offset)
151 : "memory");
152 }
153};
154
155template <bool pre_nop>
156struct buffer_load<8, pre_nop>
157{
158 template <typename T>
159 CK_TILE_DEVICE void operator()(T& value,
160 int32x4_t res /*buffer resource*/,
161 index_t v_offset,
162 index_t /*s_offset*/,
163 index_t i_offset /*max 0xFFF*/,
164 index_t /*flag*/ = 0,
166 {
167 static_assert(sizeof(T) == 8);
168 using mbuf_t = typename impl::buffer_load_trait<8, T>::payload_t;
169 if constexpr(pre_nop)
170 asm volatile("s_nop 4\n"
171 "buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3"
172 : "+v"(reinterpret_cast<mbuf_t&>(value))
173 : "v"(v_offset), "s"(res), "n"(i_offset)
174 : "memory");
175 else
176 asm volatile("buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3"
177 : "+v"(reinterpret_cast<mbuf_t&>(value))
178 : "v"(v_offset), "s"(res), "n"(i_offset)
179 : "memory");
180 }
181};
182
183template <bool pre_nop>
184struct buffer_load<4, pre_nop>
185{
186 template <typename T>
187 CK_TILE_DEVICE void operator()(T& value,
188 int32x4_t res /*buffer resource*/,
189 index_t v_offset,
190 index_t /*s_offset*/,
191 index_t i_offset /*max 0xFFF*/,
192 index_t /*flag*/ = 0,
194 {
195 static_assert(sizeof(T) == 4);
196 using mbuf_t = typename impl::buffer_load_trait<4, T>::payload_t;
197 if constexpr(pre_nop)
198 asm volatile("s_nop 4\n"
199 "buffer_load_dword %0, %1, %2, 0 offen offset:%3"
200 : "+v"(reinterpret_cast<mbuf_t&>(value))
201 : "v"(v_offset), "s"(res), "n"(i_offset)
202 : "memory");
203 else
204 asm volatile("buffer_load_dword %0, %1, %2, 0 offen offset:%3"
205 : "+v"(reinterpret_cast<mbuf_t&>(value))
206 : "v"(v_offset), "s"(res), "n"(i_offset)
207 : "memory");
208 }
209};
210
211template <bool pre_nop>
212struct buffer_load<2, pre_nop>
213{
214 template <typename T>
215 CK_TILE_DEVICE void operator()(T& value,
216 int32x4_t res /*buffer resource*/,
217 index_t v_offset,
218 index_t /*s_offset*/,
219 index_t i_offset /*max 0xFFF*/,
220 index_t /*flag*/ = 0,
222 {
223 static_assert(sizeof(T) == 4); // subdword is buggy, use dword buf and convert manually
224 using mbuf_t = typename impl::buffer_load_trait<2, T>::payload_t;
225 if constexpr(pre_nop)
226 asm volatile("s_nop 4\n"
227 "buffer_load_ushort %0, %1, %2, 0 offen offset:%3"
228 : "+v"(reinterpret_cast<mbuf_t&>(value))
229 : "v"(v_offset), "s"(res), "n"(i_offset)
230 : "memory");
231 else
232 asm volatile("buffer_load_ushort %0, %1, %2, 0 offen offset:%3"
233 : "+v"(reinterpret_cast<mbuf_t&>(value))
234 : "v"(v_offset), "s"(res), "n"(i_offset)
235 : "memory");
236 }
237};
238
239template <bool pre_nop>
240struct buffer_load<1, pre_nop>
241{
242 template <typename T>
243 CK_TILE_DEVICE void operator()(T& value,
244 int32x4_t res /*buffer resource*/,
245 index_t v_offset,
246 index_t /*s_offset*/,
247 index_t i_offset /*max 0xFFF*/,
248 index_t /*flag*/ = 0,
250 {
251 static_assert(sizeof(T) == 4);
252 using mbuf_t = typename impl::buffer_load_trait<1, T>::payload_t;
253 if constexpr(pre_nop)
254 asm volatile("s_nop 4\n"
255 "buffer_load_ubyte %0, %1, %2, 0 offen offset:%3"
256 : "+v"(reinterpret_cast<mbuf_t&>(value))
257 : "v"(v_offset), "s"(res), "n"(i_offset)
258 : "memory");
259 else
260 asm volatile("buffer_load_ubyte %0, %1, %2, 0 offen offset:%3"
261 : "+v"(reinterpret_cast<mbuf_t&>(value))
262 : "v"(v_offset), "s"(res), "n"(i_offset)
263 : "memory");
264 }
265};
266
267template <index_t bytes, bool pre_nop = false>
268struct buffer_load_if;
269
270template <bool pre_nop>
271struct buffer_load_if<16, pre_nop>
272{
273 template <typename T>
274 CK_TILE_DEVICE void operator()(T& value,
275 int32x4_t res /*buffer resource*/,
276 index_t v_offset,
277 index_t /*s_offset*/,
278 index_t i_offset /*max 0xFFF*/,
279 index_t flag = 0,
281 {
282 static_assert(sizeof(T) == 16);
283 auto saved_exec = __builtin_amdgcn_read_exec();
284 using mbuf_t = typename impl::buffer_load_trait<16, T>::payload_t;
285 static_assert(sizeof(mbuf_t) == sizeof(T));
286 if constexpr(pre_nop)
287 asm volatile("s_nop 4\n"
288 "v_cmpx_le_u32 exec, 1, %4\n"
289 "buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3\n"
290 "s_mov_b64 exec %5"
291 : "+v"(reinterpret_cast<mbuf_t&>(value))
292 : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
293 : "memory");
294 else
295 asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
296 "buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3\n"
297 "s_mov_b64 exec %5"
298 : "+v"(reinterpret_cast<mbuf_t&>(value))
299 : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
300 : "memory");
301 }
302};
303
304template <bool pre_nop>
305struct buffer_load_if<8, pre_nop>
306{
307 template <typename T>
308 CK_TILE_DEVICE void operator()(T& value,
309 int32x4_t res /*buffer resource*/,
310 index_t v_offset,
311 index_t /*s_offset*/,
312 index_t i_offset /*max 0xFFF*/,
313 index_t flag = 0,
315 {
316 static_assert(sizeof(T) == 8);
317 auto saved_exec = __builtin_amdgcn_read_exec();
318 using mbuf_t = typename impl::buffer_load_trait<8, T>::payload_t;
319 if constexpr(pre_nop)
320 asm volatile("s_nop 4\n"
321 "v_cmpx_le_u32 exec, 1, %4\n"
322 "buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3\n"
323 "s_mov_b64 exec %5"
324 : "+v"(reinterpret_cast<mbuf_t&>(value))
325 : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
326 : "memory");
327 else
328 asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
329 "buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3\n"
330 "s_mov_b64 exec %5"
331 : "+v"(reinterpret_cast<mbuf_t&>(value))
332 : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
333 : "memory");
334 }
335};
336
337template <bool pre_nop>
338struct buffer_load_if<4, pre_nop>
339{
340 template <typename T>
341 CK_TILE_DEVICE void operator()(T& value,
342 int32x4_t res /*buffer resource*/,
343 index_t v_offset,
344 index_t /*s_offset*/,
345 index_t i_offset /*max 0xFFF*/,
346 index_t flag = 0,
348 {
349 static_assert(sizeof(T) == 4);
350 auto saved_exec = __builtin_amdgcn_read_exec();
351 using mbuf_t = typename impl::buffer_load_trait<4, T>::payload_t;
352 if constexpr(pre_nop)
353 asm volatile("s_nop 4\n"
354 "v_cmpx_le_u32 exec, 1, %4\n"
355 "buffer_load_dword %0, %1, %2, 0 offen offset:%3\n"
356 "s_mov_b64 exec %5"
357 : "+v"(reinterpret_cast<mbuf_t&>(value))
358 : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
359 : "memory");
360 else
361 asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
362 "buffer_load_dword %0, %1, %2, 0 offen offset:%3\n"
363 "s_mov_b64 exec %5"
364 : "+v"(reinterpret_cast<mbuf_t&>(value))
365 : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
366 : "memory");
367 }
368};
369
370template <bool pre_nop>
371struct buffer_load_if<2, pre_nop>
372{
373 template <typename T>
374 CK_TILE_DEVICE void operator()(T& value,
375 int32x4_t res /*buffer resource*/,
376 index_t v_offset,
377 index_t /*s_offset*/,
378 index_t i_offset /*max 0xFFF*/,
379 index_t flag = 0,
381 {
382 static_assert(sizeof(T) == 4);
383 auto saved_exec = __builtin_amdgcn_read_exec();
384 using mbuf_t = typename impl::buffer_load_trait<2, T>::payload_t;
385 if constexpr(pre_nop)
386 asm volatile("s_nop 4\n"
387 "v_cmpx_le_u32 exec, 1, %4\n"
388 "buffer_load_ushort %0, %1, %2, 0 offen offset:%3\n"
389 "s_mov_b64 exec %5"
390 : "+v"(reinterpret_cast<mbuf_t&>(value))
391 : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
392 : "memory");
393 else
394 asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
395 "buffer_load_ushort %0, %1, %2, 0 offen offset:%3\n"
396 "s_mov_b64 exec %5"
397 : "+v"(reinterpret_cast<mbuf_t&>(value))
398 : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
399 : "memory");
400 }
401};
402
403template <bool pre_nop>
404struct buffer_load_if<1, pre_nop>
405{
406 template <typename T>
407 CK_TILE_DEVICE void operator()(T& value,
408 int32x4_t res /*buffer resource*/,
409 index_t v_offset,
410 index_t /*s_offset*/,
411 index_t i_offset /*max 0xFFF*/,
412 index_t flag = 0,
414 {
415 static_assert(sizeof(T) == 4);
416 auto saved_exec = __builtin_amdgcn_read_exec();
417 using mbuf_t = typename impl::buffer_load_trait<1, T>::payload_t;
418 if constexpr(pre_nop)
419 asm volatile("s_nop 4\n"
420 "v_cmpx_le_u32 exec, 1, %4\n"
421 "buffer_load_ubyte %0, %1, %2, 0 offen offset:%3\n"
422 "s_mov_b64 exec %5"
423 : "+v"(reinterpret_cast<mbuf_t&>(value))
424 : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
425 : "memory");
426 else
427 asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
428 "buffer_load_ubyte %0, %1, %2, 0 offen offset:%3\n"
429 "s_mov_b64 exec %5"
430 : "+v"(reinterpret_cast<mbuf_t&>(value))
431 : "v"(v_offset), "s"(res), "n"(i_offset), "v"(flag), "s"(saved_exec)
432 : "memory");
433 }
434};
435#pragma clang diagnostic pop // "-Wundefined-reinterpret-cast"
436template <index_t bytes>
437struct buffer_store;
438
439template <>
440struct buffer_store<16>
441{
442 template <typename T>
443 CK_TILE_DEVICE void operator()(const T& value,
444 int32x4_t res /*buffer resource*/,
445 index_t v_offset,
446 index_t /*s_offset*/,
447 index_t i_offset /*max 0xFFF*/,
448 index_t /*flag*/ = 1)
449 {
450 static_assert(sizeof(T) == 16);
451 using mbuf_t = fp32x4_t;
452 asm volatile("buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3"
453 :
454 : "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
455 : "memory");
456 }
457};
458
459template <>
460struct buffer_store<8>
461{
462 template <typename T>
463 CK_TILE_DEVICE void operator()(const T& value,
464 int32x4_t res /*buffer resource*/,
465 index_t v_offset,
466 index_t /*s_offset*/,
467 index_t i_offset /*max 0xFFF*/,
468 index_t /*flag*/ = 1)
469 {
470 static_assert(sizeof(T) == 8);
471 using mbuf_t = fp32x2_t;
472 asm volatile("buffer_store_dwordx2 %0, %1, %2, 0 offen offset:%3"
473 :
474 : "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
475 : "memory");
476 }
477};
478
479template <>
480struct buffer_store<4>
481{
482 template <typename T>
483 CK_TILE_DEVICE void operator()(const T& value,
484 int32x4_t res /*buffer resource*/,
485 index_t v_offset,
486 index_t /*s_offset*/,
487 index_t i_offset /*max 0xFFF*/,
488 index_t /*flag*/ = 1)
489 {
490 static_assert(sizeof(T) == 4);
491 using mbuf_t = float;
492 asm volatile("buffer_store_dword %0, %1, %2, 0 offen offset:%3"
493 :
494 : "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
495 : "memory");
496 }
497};
498
499template <>
500struct buffer_store<2>
501{
502 template <typename T>
503 CK_TILE_DEVICE void operator()(const T& value,
504 int32x4_t res /*buffer resource*/,
505 index_t v_offset,
506 index_t /*s_offset*/,
507 index_t i_offset /*max 0xFFF*/,
508 index_t /*flag*/ = 1)
509 {
510 static_assert(sizeof(T) == 2);
511 using mbuf_t = short;
512 asm volatile("buffer_store_short %0, %1, %2, 0 offen offset:%3"
513 :
514 : "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
515 : "memory");
516 }
517};
518
519template <>
520struct buffer_store<1>
521{
522 template <typename T>
523 CK_TILE_DEVICE void operator()(const T& value,
524 int32x4_t res /*buffer resource*/,
525 index_t v_offset,
526 index_t /*s_offset*/,
527 index_t i_offset /*max 0xFFF*/,
528 index_t /*flag*/ = 1)
529 {
530 static_assert(sizeof(T) == 4);
531 using mbuf_t = float;
532 asm volatile("buffer_store_byte %0, %1, %2, 0 offen offset:%3"
533 :
534 : "v"(bit_cast<mbuf_t>(value)), "v"(v_offset), "s"(res), "n"(i_offset)
535 : "memory");
536 }
537};
538
539template <index_t bytes>
540struct buffer_store_if;
541
542template <>
543struct buffer_store_if<16>
544{
545 template <typename T>
546 CK_TILE_DEVICE void operator()(const T& value,
547 int32x4_t res /*buffer resource*/,
548 index_t v_offset,
549 index_t /*s_offset*/,
550 index_t i_offset /*max 0xFFF*/,
551 index_t flag = 1)
552 {
553 static_assert(sizeof(T) == 16);
554 auto save_exec = __builtin_amdgcn_read_exec();
555 using mbuf_t = fp32x4_t;
556 asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
557 "buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3\n"
558 "s_mov_b64 exec %5"
559 :
560 : "v"(bit_cast<mbuf_t>(value)),
561 "v"(v_offset),
562 "s"(res),
563 "n"(i_offset),
564 "v"(flag),
565 "s"(save_exec)
566 : "memory");
567 }
568};
569
570template <>
571struct buffer_store_if<8>
572{
573 template <typename T>
574 CK_TILE_DEVICE void operator()(const T& value,
575 int32x4_t res /*buffer resource*/,
576 index_t v_offset,
577 index_t /*s_offset*/,
578 index_t i_offset /*max 0xFFF*/,
579 index_t flag = 1)
580 {
581 static_assert(sizeof(T) == 8);
582 auto save_exec = __builtin_amdgcn_read_exec();
583 // TODO: ugly. rocm-6.0/6.1 seems neet bit_cast to same base type to avoid scratch
584 using mbuf_t = ext_vector_t<typename T::value_type, T::size()>;
585 asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
586 "buffer_store_dwordx2 %0, %1, %2, 0 offen offset:%3\n"
587 "s_mov_b64 exec %5"
588 :
589 : "v"(bit_cast<mbuf_t>(value)),
590 "v"(v_offset),
591 "s"(res),
592 "n"(i_offset),
593 "v"(flag),
594 "s"(save_exec)
595 : "memory");
596 }
597};
598
599template <>
600struct buffer_store_if<4>
601{
602 template <typename T>
603 CK_TILE_DEVICE void operator()(const T& value,
604 int32x4_t res /*buffer resource*/,
605 index_t v_offset,
606 index_t /*s_offset*/,
607 index_t i_offset /*max 0xFFF*/,
608 index_t flag = 1)
609 {
610 static_assert(sizeof(T) == 4);
611 auto save_exec = __builtin_amdgcn_read_exec();
612 using mbuf_t = float;
613 asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
614 "buffer_store_dword %0, %1, %2, 0 offen offset:%3\n"
615 "s_mov_b64 exec %5"
616 :
617 : "v"(bit_cast<mbuf_t>(value)),
618 "v"(v_offset),
619 "s"(res),
620 "n"(i_offset),
621 "v"(flag),
622 "s"(save_exec)
623 : "memory");
624 }
625};
626
627template <>
628struct buffer_store_if<2>
629{
630 template <typename T>
631 CK_TILE_DEVICE void operator()(const T& value,
632 int32x4_t res /*buffer resource*/,
633 index_t v_offset,
634 index_t /*s_offset*/,
635 index_t i_offset /*max 0xFFF*/,
636 index_t flag = 1)
637 {
638 static_assert(sizeof(T) == 2);
639 auto save_exec = __builtin_amdgcn_read_exec();
640 using mbuf_t = short;
641 asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
642 "buffer_store_short %0, %1, %2, 0 offen offset:%3\n"
643 "s_mov_b64 exec %5"
644 :
645 : "v"(bit_cast<mbuf_t>(value)),
646 "v"(v_offset),
647 "s"(res),
648 "n"(i_offset),
649 "v"(flag),
650 "s"(save_exec)
651 : "memory");
652 }
653};
654
655template <>
656struct buffer_store_if<1>
657{
658 template <typename T>
659 CK_TILE_DEVICE void operator()(const T& value,
660 int32x4_t res /*buffer resource*/,
661 index_t v_offset,
662 index_t /*s_offset*/,
663 index_t i_offset /*max 0xFFF*/,
664 index_t flag = 1)
665 {
666 static_assert(sizeof(T) == 4);
667 auto save_exec = __builtin_amdgcn_read_exec();
668 using mbuf_t = float;
669 asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
670 "buffer_store_byte %0, %1, %2, 0 offen offset:%3\n"
671 "s_mov_b64 exec %5"
672 :
673 : "v"(bit_cast<mbuf_t>(value)),
674 "v"(v_offset),
675 "s"(res),
676 "n"(i_offset),
677 "v"(flag),
678 "s"(save_exec)
679 : "memory");
680 }
681};
682
684{
685 asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory");
686}
687
689{
690 asm volatile("s_waitcnt lgkmcnt(%0)" : : "n"(cnt) : "memory");
691}
692
693template <typename scalar_type, index_t N, bool pre_nop = false>
695
696template <bool pre_nop>
697struct buffer_atomic_add_if<bf16_t, 2, pre_nop>
698{
699 template <typename T>
700 CK_TILE_DEVICE void operator()(const T& value,
701 int32x4_t res /*buffer resource*/,
702 index_t v_offset,
703 index_t /*s_offset*/,
704 index_t i_offset /*max 0xFFF*/,
705 index_t flag = 1)
706 {
707 static_assert(sizeof(T) == 4);
708 auto save_exec = __builtin_amdgcn_read_exec();
709 using mbuf_t = float;
710 asm volatile("v_cmpx_le_u32 exec, 1, %4\n"
711 "global_atomic_pk_add_bf16 %0, %1, %2 offset:%3\n"
712 "s_mov_b64 exec %5"
713 :
714 : "v"(v_offset),
716 "s"(res.xy),
717 "n"(i_offset),
718 "v"(flag),
719 "s"(save_exec)
720 : "memory");
721 }
722};
723
724template <typename scalar_type, index_t N, bool pre_nop = false>
725struct buffer_atomic_add;
726
727template <bool pre_nop>
728struct buffer_atomic_add<bf16_t, 2, pre_nop>
729{
730 template <typename T>
731 CK_TILE_DEVICE void operator()(const T& value,
732 int32x4_t res /*buffer resource*/,
733 index_t v_offset,
734 index_t /*s_offset*/,
735 index_t i_offset /*max 0xFFF*/,
736 index_t /*flag = 1*/)
737 {
738 static_assert(sizeof(T) == 4);
739 using mbuf_t = float;
740 asm volatile("global_atomic_pk_add_bf16 %0, %1, %2 offset:%3"
741 :
742 : "v"(v_offset), "v"(bit_cast<mbuf_t>(value)), "s"(res.xy), "n"(i_offset)
743 : "memory");
744 }
745};
746
747namespace impl {
748// below type indicate the data type used for buffer load inline asm
749// clang-format off
750template<index_t N, typename T> struct smem_load_trait;
751
752template<typename T> struct smem_load_trait<16, T> { using payload_t = fp32x4_t; };
753template<typename T> struct smem_load_trait<8 , T> { using payload_t = fp32x2_t; };
754template<typename T> struct smem_load_trait<4 , T> { using payload_t = float; };
755template<typename T> struct smem_load_trait<2 , T> { using payload_t = float; };
756template<typename T> struct smem_load_trait<1 , T> { using payload_t = float; };
757
758// clang-format on
759} // namespace impl
760
761// NOTE: smem load/store no need pre_nop to make sure dependency by sw, happy :)
762template <index_t>
763struct smem_load;
764
765template <>
766struct smem_load<16>
767{
768 template <typename T>
769 CK_TILE_DEVICE void operator()(T& value, index_t v_offset, index_t i_offset)
770 {
771 static_assert(sizeof(T) == 16);
772 using mbuf_t = typename impl::smem_load_trait<16, T>::payload_t;
773 asm volatile("ds_read_b128 %0, %1 offset:%2"
774 : "=v"(reinterpret_cast<mbuf_t&>(value)) // ! direct write
775 : "v"(v_offset), "n"(i_offset)
776 : "memory");
777 }
778};
779
780template <>
781struct smem_load<8>
782{
783 template <typename T>
784 CK_TILE_DEVICE void operator()(T& value, index_t v_offset, index_t i_offset)
785 {
786 static_assert(sizeof(T) == 8);
787 using mbuf_t = typename impl::smem_load_trait<8, T>::payload_t;
788 asm volatile("ds_read_b64 %0, %1 offset:%2"
789 : "=v"(reinterpret_cast<mbuf_t&>(value)) // ! direct write
790 : "v"(v_offset), "n"(i_offset)
791 : "memory");
792 }
793};
794
795template <>
796struct smem_load<4>
797{
798 template <typename T>
799 CK_TILE_DEVICE void operator()(T& value, index_t v_offset, index_t i_offset)
800 {
801 static_assert(sizeof(T) == 4);
802 using mbuf_t = typename impl::smem_load_trait<4, T>::payload_t;
803 asm volatile("ds_read_b32 %0, %1 offset:%2"
804 : "=v"(reinterpret_cast<mbuf_t&>(value)) // ! direct write
805 : "v"(v_offset), "n"(i_offset)
806 : "memory");
807 }
808};
809
810template <>
811struct smem_load<2>
812{
813 template <typename T>
814 CK_TILE_DEVICE void operator()(T& value, index_t v_offset, index_t i_offset)
815 {
816 static_assert(sizeof(T) == 4); // subdword is buggy, use dword buf and convert manually
817 using mbuf_t = typename impl::smem_load_trait<1, T>::payload_t;
818 asm volatile("ds_read_u16 %0, %1 offset:%2"
819 : "=v"(reinterpret_cast<mbuf_t&>(value)) // ! direct write
820 : "v"(v_offset), "n"(i_offset)
821 : "memory");
822 }
823};
824
825template <>
826struct smem_load<1>
827{
828 template <typename T>
829 CK_TILE_DEVICE void operator()(T& value, index_t v_offset, index_t i_offset)
830 {
831 static_assert(sizeof(T) == 4);
832 using mbuf_t = typename impl::smem_load_trait<1, T>::payload_t;
833 asm volatile("ds_read_u8 %0, %1 offset:%2"
834 : "=v"(reinterpret_cast<mbuf_t&>(value)) // ! direct write
835 : "v"(v_offset), "n"(i_offset)
836 : "memory");
837 }
838};
839
840// clang-format off
841namespace impl{
842
843// can't use "+v" since there could be potential extra move(read/write)
844// use "v" can help remove such duplicated moves
845// besides, fake this as "memory" operation to force later valu after this fence
846// TODO: may have scratch (because this is memory?)
847// need to reduce extra move inside compiler
848template<index_t N>
849CK_TILE_DEVICE void insert_dummy_dep_per_dword(array<float, N>& b)
850{
851 constexpr auto kSize = remove_cvref_t<decltype(b)>::size();
852 static_for<0, kSize, 1>{}([&](auto i){
853 asm volatile(" " : : "v"(b.get(number<i>{})) : "memory");
854 });
855}
856#if 1
857// below specialization just merge size() of dwords into single section
858template<>
859CK_TILE_DEVICE void insert_dummy_dep_per_dword<2>(array<float, 2>& b)
860{
861 asm volatile(" " : : "v"(b.get(number<0>{})), "v"(b.get(number<1>{})) : "memory");
862}
863
864template<>
865CK_TILE_DEVICE void insert_dummy_dep_per_dword<3>(array<float, 3>& b)
866{
867 asm volatile(" " : : "v"(b.get(number<0>{})), "v"(b.get(number<1>{})), "v"(b.get(number<2>{})) : "memory");
868}
869
870template<>
871CK_TILE_DEVICE void insert_dummy_dep_per_dword<4>(array<float, 4>& b)
872{
873 asm volatile(" " : : "v"(b.get(number<0>{})), "v"(b.get(number<1>{})), "v"(b.get(number<2>{})), "v"(b.get(number<3>{})) : "memory");
874}
875
876template<>
877CK_TILE_DEVICE void insert_dummy_dep_per_dword<8>(array<float, 8>& b)
878{
879 asm volatile(" " : : "v"(b.get(number<0>{})), "v"(b.get(number<1>{})), "v"(b.get(number<2>{})), "v"(b.get(number<3>{})),
880 "v"(b.get(number<4>{})), "v"(b.get(number<5>{})), "v"(b.get(number<6>{})), "v"(b.get(number<7>{})) : "memory");
881}
882
883template<>
884CK_TILE_DEVICE void insert_dummy_dep_per_dword<16>(array<float, 16>& b)
885{
886 asm volatile(" " : : "v"(b.get(number<0>{})), "v"(b.get(number<1>{})), "v"(b.get(number<2>{})), "v"(b.get(number<3>{})),
887 "v"(b.get(number<4>{})), "v"(b.get(number<5>{})), "v"(b.get(number<6>{})), "v"(b.get(number<7>{})),
888 "v"(b.get(number<8>{})), "v"(b.get(number<9>{})), "v"(b.get(number<10>{})), "v"(b.get(number<11>{})),
889 "v"(b.get(number<12>{})), "v"(b.get(number<13>{})), "v"(b.get(number<14>{})), "v"(b.get(number<15>{})) : "memory");
890}
891
892template<>
893CK_TILE_DEVICE void insert_dummy_dep_per_dword<32>(array<float, 32>& b)
894{
895 asm volatile(" " : : "v"(b.get(number<0>{})), "v"(b.get(number<1>{})), "v"(b.get(number<2>{})), "v"(b.get(number<3>{})),
896 "v"(b.get(number<4>{})), "v"(b.get(number<5>{})), "v"(b.get(number<6>{})), "v"(b.get(number<7>{})),
897 "v"(b.get(number<8>{})), "v"(b.get(number<9>{})), "v"(b.get(number<10>{})), "v"(b.get(number<11>{})),
898 "v"(b.get(number<12>{})), "v"(b.get(number<13>{})), "v"(b.get(number<14>{})), "v"(b.get(number<15>{})),
899 "v"(b.get(number<16>{})), "v"(b.get(number<17>{})), "v"(b.get(number<18>{})), "v"(b.get(number<19>{})),
900 "v"(b.get(number<20>{})), "v"(b.get(number<21>{})), "v"(b.get(number<22>{})), "v"(b.get(number<23>{})),
901 "v"(b.get(number<24>{})), "v"(b.get(number<25>{})), "v"(b.get(number<26>{})), "v"(b.get(number<27>{})),
902 "v"(b.get(number<28>{})), "v"(b.get(number<29>{})), "v"(b.get(number<30>{})), "v"(b.get(number<31>{})) : "memory");
903}
904#endif
906
907template<typename T>
908CK_TILE_DEVICE void insert_dummy_dep(T & buffer)
909{
910 // TODO: indeed we expect T to be multiple of dword. subdword is always buggy
911 using da_type = array<float, (sizeof(T) + 3) / 4>;
912 auto & dummy = reinterpret_cast<da_type&>(buffer);
914}
915
916template<typename Tx, typename... Ty>
917CK_TILE_DEVICE void insert_dummy_dep(Tx& bx, Ty&... by)
918{
920 insert_dummy_dep(by...);
921}
922}
923// clang-format on
924template <typename... T>
925CK_TILE_DEVICE void buffer_load_fence(index_t cnt = 0, T&... o)
926{
927 asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory");
929}
930
932{
933 asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory");
934}
935
937{
938 asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory");
939}
940
941// buffer load i8
944 index_t voffset,
945 index_t soffset,
946 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8");
947
950 index_t voffset,
951 index_t soffset,
952 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8");
953
956 index_t voffset,
957 index_t soffset,
958 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8");
959
960// buffer load i16
963 index_t voffset,
964 index_t soffset,
965 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16");
966
969 index_t voffset,
970 index_t soffset,
971 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16");
972
975 index_t voffset,
976 index_t soffset,
977 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16");
978
979// buffer load i32
982 index_t voffset,
983 index_t soffset,
984 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32");
985
988 index_t voffset,
989 index_t soffset,
990 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32");
991
994 index_t voffset,
995 index_t soffset,
996 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32");
997
998// buffer load fp16
1001 index_t voffset,
1002 index_t soffset,
1003 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16");
1004
1007 index_t voffset,
1008 index_t soffset,
1009 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16");
1010
1013 index_t voffset,
1014 index_t soffset,
1015 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16");
1016
1017// buffer load fp32
1020 index_t voffset,
1021 index_t soffset,
1022 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32");
1023
1026 index_t voffset,
1027 index_t soffset,
1028 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32");
1029
1032 index_t voffset,
1033 index_t soffset,
1034 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32");
1035
1036// buffer store i8
1039 int32x4_t rsrc,
1040 index_t voffset,
1041 index_t soffset,
1042 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8");
1043
1046 int32x4_t rsrc,
1047 index_t voffset,
1048 index_t soffset,
1049 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8");
1050
1053 int32x4_t rsrc,
1054 index_t voffset,
1055 index_t soffset,
1056 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8");
1057
1058// buffer store i16
1061 int32x4_t rsrc,
1062 index_t voffset,
1063 index_t soffset,
1064 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16");
1065
1068 int32x4_t rsrc,
1069 index_t voffset,
1070 index_t soffset,
1071 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16");
1072
1075 int32x4_t rsrc,
1076 index_t voffset,
1077 index_t soffset,
1078 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16");
1079
1080// buffer store i32
1083 int32x4_t rsrc,
1084 index_t voffset,
1085 index_t soffset,
1086 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32");
1087
1088// buffer store ui16
1091 int32x4_t rsrc,
1092 index_t voffset,
1093 index_t soffset,
1094 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16");
1095
1098 int32x4_t rsrc,
1099 index_t voffset,
1100 index_t soffset,
1101 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16");
1102
1105 int32x4_t rsrc,
1106 index_t voffset,
1107 index_t soffset,
1108 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16");
1109
1112 int32x4_t rsrc,
1113 index_t voffset,
1114 index_t soffset,
1115 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32");
1116
1119 int32x4_t rsrc,
1120 index_t voffset,
1121 index_t soffset,
1122 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32");
1123
1124// buffer store fp16
1127 int32x4_t rsrc,
1128 index_t voffset,
1129 index_t soffset,
1130 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16");
1131
1134 int32x4_t rsrc,
1135 index_t voffset,
1136 index_t soffset,
1137 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16");
1138
1141 int32x4_t rsrc,
1142 index_t voffset,
1143 index_t soffset,
1144 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16");
1145
1146// buffer store fp32
1149 int32x4_t rsrc,
1150 index_t voffset,
1151 index_t soffset,
1152 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32");
1153
1156 int32x4_t rsrc,
1157 index_t voffset,
1158 index_t soffset,
1159 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32");
1160
1163 int32x4_t rsrc,
1164 index_t voffset,
1165 index_t soffset,
1166 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32");
1167
1168// buffer atomic-add fp16
1170 fp16x2_t vdata,
1171 int32x4_t rsrc,
1172 index_t voffset,
1173 index_t soffset,
1174 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16");
1175
1176// buffer atomic-add bf16
1177// TODO: Replace with bf16x2_t, but llvm builins only accept cktile_bf16x2_t now.
1179 bf16x2_t vdata,
1180 int32x4_t rsrc,
1181 index_t voffset,
1182 index_t soffset,
1183 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16");
1184
1185// buffer atomic-add i32
1187 int32_t vdata,
1188 int32x4_t rsrc,
1189 index_t voffset,
1190 index_t soffset,
1191 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32");
1192
1193// buffer atomic-add fp32
1195 float vdata,
1196 int32x4_t rsrc,
1197 index_t voffset,
1198 index_t soffset,
1199 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32");
1200
1201// buffer atomic-max fp64
1204 int32x4_t rsrc, // dst_wave_buffer_resource
1205 int voffset, // dst_thread_addr_offset
1206 int soffset, // dst_wave_addr_offset
1207 int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64");
1208
1209// Direct loads from global to LDS.
1212 as3_uint32_ptr lds_ptr,
1213 index_t size,
1214 index_t voffset,
1215 index_t soffset,
1217 index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds");
1218
1219template <unsigned num_dwords, bool pre_nop = false>
1221 int32x4_t rsrc,
1222 index_t voffset,
1223 index_t /*soffset*/,
1224 index_t ioffset /*max 0xFFF*/,
1225 index_t /*flag*/ = 0,
1227{
1228#define CK_TILE_ASYNC_LOAD_WITH_INSTR(instr) \
1229 if constexpr(pre_nop) \
1230 asm volatile("s_nop 4\n" instr " %1, %2, 0 offen offset:%3 lds" \
1231 : "=r"(smem) /*dummy dependency for smem*/ \
1232 : "v"(voffset), "s"(rsrc), "n"(ioffset) \
1233 : "memory"); \
1234 else \
1235 asm volatile(instr " %1, %2, 0 offen offset:%3 lds" \
1236 : "=r"(smem) /*dummy dependency for smem*/ \
1237 : "v"(voffset), "s"(rsrc), "n"(ioffset) \
1238 : "memory");
1239
1240 if constexpr(num_dwords == 1)
1241 {
1242 CK_TILE_ASYNC_LOAD_WITH_INSTR("buffer_load_dword");
1243 }
1244#if defined(__gfx950__)
1245 else if constexpr(num_dwords == 3)
1246 {
1247 CK_TILE_ASYNC_LOAD_WITH_INSTR("buffer_load_dwordx3");
1248 }
1249 else if constexpr(num_dwords == 4)
1250 {
1251 CK_TILE_ASYNC_LOAD_WITH_INSTR("buffer_load_dwordx4");
1252 }
1253#endif
1254 else
1255 {
1256 static_assert(false, "wrong! not implemented data width");
1257 }
1258#undef CK_TILE_ASYNC_LOAD_WITH_INSTR
1259}
1260
1262{
1263 asm volatile("s_waitcnt vmcnt(%0)" : : "n"(cnt) : "memory");
1264}
1265
1266// memory coherency bit for buffer store/load instruction
1267// check ISA manual for each GFX target
1268// e.g. for
1269// https://www.amd.com/system/files/TechDocs/instinct-mi200-cdna2-instruction-set-architecture.pdf,
1270// page 67~68
1271enum struct amd_buffer_coherence_enum
1272{
1273 coherence_default = 0, // default value
1274 glc = 1,
1275 slc = 2,
1276 glc_slc = 3,
1277 // gfx94: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1
1278 // SC[1:0] System Cache level: 0=wave, 1=group, 2=device, 3=system
1279 // NT Non-Temporal: 0=expect temporal reuse; 1=do not expect temporal reuse
1280 WAVE_NT0 = 0,
1281 WAVE_NT1 = 2,
1282 GROUP_NT0 = 1,
1283 GROUP_NT1 = 3,
1284 DEVICE_NT0 = 8,
1285 DEVICE_NT1 = 10,
1286 SYSTEM_NT0 = 9,
1287 SYSTEM_NT1 = 11,
1288};
1289
1290template <index_t N,
1293amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource,
1294 index_t src_thread_addr_offset,
1295 index_t src_wave_addr_offset)
1296{
1297 static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
1298 "wrong! not implemented");
1299
1300 using rtn_type = thread_buffer<int8_t, N>;
1301
1302 if constexpr(N == 1)
1303 {
1304 return bit_cast<rtn_type>(llvm_amdgcn_raw_buffer_load_i8(src_wave_buffer_resource,
1305 src_thread_addr_offset,
1306 src_wave_addr_offset,
1307 static_cast<index_t>(coherence)));
1308 }
1309 else if constexpr(N == 2)
1310 {
1311
1312 int16_t tmp = llvm_amdgcn_raw_buffer_load_i16(src_wave_buffer_resource,
1313 src_thread_addr_offset,
1314 src_wave_addr_offset,
1315 static_cast<index_t>(coherence));
1316
1317 return bit_cast<rtn_type>(tmp);
1318 }
1319 else if constexpr(N == 4)
1320 {
1321 int32_t tmp = llvm_amdgcn_raw_buffer_load_i32(src_wave_buffer_resource,
1322 src_thread_addr_offset,
1323 src_wave_addr_offset,
1324 static_cast<index_t>(coherence));
1325
1326 return bit_cast<rtn_type>(tmp);
1327 }
1328 else if constexpr(N == 8)
1329 {
1330 int32x2_t tmp = llvm_amdgcn_raw_buffer_load_i32x2(src_wave_buffer_resource,
1331 src_thread_addr_offset,
1332 src_wave_addr_offset,
1333 static_cast<index_t>(coherence));
1334
1335 return bit_cast<rtn_type>(tmp);
1336 }
1337 else if constexpr(N == 16)
1338 {
1339 int32x4_t tmp = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
1340 src_thread_addr_offset,
1341 src_wave_addr_offset,
1342 static_cast<index_t>(coherence));
1343 return bit_cast<rtn_type>(tmp);
1344 }
1345 else if constexpr(N == 32)
1346 {
1347 int32x4_t tmp0 = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
1348 src_thread_addr_offset,
1349 src_wave_addr_offset,
1350 static_cast<index_t>(coherence));
1351 int32x4_t tmp1 =
1352 llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
1353 src_thread_addr_offset,
1354 src_wave_addr_offset + 4 * sizeof(int32_t),
1355 static_cast<index_t>(coherence));
1357
1358 tmp.template get_as<int32x4_t>()(number<0>{}) = tmp0;
1359 tmp.template get_as<int32x4_t>()(number<1>{}) = tmp1;
1360
1361 return bit_cast<rtn_type>(tmp);
1362 }
1363 else if constexpr(N == 64)
1364 {
1365 int32x4_t tmp0 = llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
1366 src_thread_addr_offset,
1367 src_wave_addr_offset,
1368 static_cast<index_t>(coherence));
1369 int32x4_t tmp1 =
1370 llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
1371 src_thread_addr_offset,
1372 src_wave_addr_offset + 4 * sizeof(int32_t),
1373 static_cast<index_t>(coherence));
1374 int32x4_t tmp2 =
1375 llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
1376 src_thread_addr_offset,
1377 src_wave_addr_offset + 8 * sizeof(int32_t),
1378 static_cast<index_t>(coherence));
1379 int32x4_t tmp3 =
1380 llvm_amdgcn_raw_buffer_load_i32x4(src_wave_buffer_resource,
1381 src_thread_addr_offset,
1382 src_wave_addr_offset + 12 * sizeof(int32_t),
1383 static_cast<index_t>(coherence));
1384
1386
1387 tmp.template get_as<int32x4_t>()(number<0>{}) = tmp0;
1388 tmp.template get_as<int32x4_t>()(number<1>{}) = tmp1;
1389 tmp.template get_as<int32x4_t>()(number<2>{}) = tmp2;
1390 tmp.template get_as<int32x4_t>()(number<3>{}) = tmp3;
1391
1392 return bit_cast<rtn_type>(tmp);
1393 }
1394}
1395
1396#ifndef BUFFER_LOAD_USE_INLINEASM
1397#define BUFFER_LOAD_USE_INLINEASM 0
1398#endif
1399
1400template <typename T,
1401 index_t N,
1404 index_t src_thread_addr_offset,
1405 index_t src_wave_addr_offset)
1406{
1407 static_assert(
1408 (std::is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
1409 (std::is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1410 (std::is_same<T, fp16_t>::value &&
1411 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32)) ||
1412 (std::is_same<T, bf16_t>::value &&
1413 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32)) ||
1414 (std::is_same<T, int32_t>::value &&
1415 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1416 (std::is_same<T, fp8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1417 (std::is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1418 (std::is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1419 (std::is_same<T, e8m0_bexp_t>::value &&
1420 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1421 (std::is_same<T, pk_fp4_raw_t>::value &&
1422 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1423 (std::is_same<T, pk_int4_t>::value &&
1424 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32) ||
1425 (std::is_same<T, pk_fp4_t>::value &&
1426 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16))),
1427 "wrong! not implemented");
1428
1429 using rtn_type = thread_buffer<T, N>;
1430
1431 if constexpr(std::is_same<T, float>::value) // fp32
1432 {
1433 if constexpr(N == 1)
1434 {
1435 return bit_cast<rtn_type>(
1436 llvm_amdgcn_raw_buffer_load_fp32(src_wave_buffer_resource,
1437 src_thread_addr_offset,
1438 src_wave_addr_offset,
1439 static_cast<index_t>(coherence)));
1440 }
1441 else if constexpr(N == 2)
1442 {
1443 return bit_cast<rtn_type>(
1444 llvm_amdgcn_raw_buffer_load_fp32x2(src_wave_buffer_resource,
1445 src_thread_addr_offset,
1446 src_wave_addr_offset,
1447 static_cast<index_t>(coherence)));
1448 }
1449 else if constexpr(N == 4)
1450 {
1451 return bit_cast<rtn_type>(
1452 llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
1453 src_thread_addr_offset,
1454 src_wave_addr_offset,
1455 static_cast<index_t>(coherence)));
1456 }
1457 else if constexpr(N == 8)
1458 {
1460
1461 tmp.template get_as<fp32x4_t>()(number<0>{}) =
1462 llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
1463 src_thread_addr_offset,
1464 src_wave_addr_offset,
1465 static_cast<index_t>(coherence));
1466
1467 tmp.template get_as<fp32x4_t>()(number<1>{}) =
1468 llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
1469 src_thread_addr_offset,
1470 src_wave_addr_offset + 4 * sizeof(float),
1471 static_cast<index_t>(coherence));
1472
1473 return tmp;
1474 }
1475 else if constexpr(N == 16)
1476 {
1478
1479 tmp.template get_as<fp32x4_t>()(number<0>{}) =
1480 llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
1481 src_thread_addr_offset,
1482 src_wave_addr_offset,
1483 static_cast<index_t>(coherence));
1484
1485 tmp.template get_as<fp32x4_t>()(number<1>{}) =
1486 llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
1487 src_thread_addr_offset,
1488 src_wave_addr_offset + 4 * sizeof(float),
1489 static_cast<index_t>(coherence));
1490
1491 tmp.template get_as<fp32x4_t>()(number<2>{}) =
1492 llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
1493 src_thread_addr_offset,
1494 src_wave_addr_offset + 8 * sizeof(float),
1495 static_cast<index_t>(coherence));
1496
1497 tmp.template get_as<fp32x4_t>()(number<3>{}) =
1498 llvm_amdgcn_raw_buffer_load_fp32x4(src_wave_buffer_resource,
1499 src_thread_addr_offset,
1500 src_wave_addr_offset + 12 * sizeof(float),
1501 static_cast<index_t>(coherence));
1502
1503 return tmp;
1504 }
1505 }
1506 else if constexpr(std::is_same<T, fp16_t>::value) // fp16
1507 {
1508 if constexpr(N == 1)
1509 {
1510 return bit_cast<rtn_type>(
1511 llvm_amdgcn_raw_buffer_load_fp16(src_wave_buffer_resource,
1512 src_thread_addr_offset,
1513 src_wave_addr_offset,
1514 static_cast<index_t>(coherence)));
1515 }
1516 else if constexpr(N == 2)
1517 {
1518 return bit_cast<rtn_type>(
1519 llvm_amdgcn_raw_buffer_load_fp16x2(src_wave_buffer_resource,
1520 src_thread_addr_offset,
1521 src_wave_addr_offset,
1522 static_cast<index_t>(coherence)));
1523 }
1524 else if constexpr(N == 4)
1525 {
1526 return bit_cast<rtn_type>(
1527 llvm_amdgcn_raw_buffer_load_fp16x4(src_wave_buffer_resource,
1528 src_thread_addr_offset,
1529 src_wave_addr_offset,
1530 static_cast<index_t>(coherence)));
1531 }
1532 else
1533 {
1534 // N >= 8: build from fp32x4 chunks
1535 thread_buffer<float, N / 2> tmp;
1536
1537 static_for<0, (N / 8), 1>{}([&](auto i) {
1538 constexpr index_t chunk = i;
1539 tmp.template get_as<fp32x4_t>()(i) = llvm_amdgcn_raw_buffer_load_fp32x4(
1540 src_wave_buffer_resource,
1541 src_thread_addr_offset,
1542 src_wave_addr_offset + (chunk * 4) * sizeof(float),
1543 static_cast<index_t>(coherence));
1544 });
1545 return bit_cast<rtn_type>(tmp);
1546 }
1547 }
1548 else if constexpr(std::is_same<T, bf16_t>::value) // bf16
1549 {
1550 if constexpr(N == 1)
1551 {
1552 return bit_cast<rtn_type>(
1553 llvm_amdgcn_raw_buffer_load_i16(src_wave_buffer_resource,
1554 src_thread_addr_offset,
1555 src_wave_addr_offset,
1556 static_cast<index_t>(coherence)));
1557 }
1558 else if constexpr(N == 2)
1559 {
1560 return bit_cast<rtn_type>(
1561 llvm_amdgcn_raw_buffer_load_i16x2(src_wave_buffer_resource,
1562 src_thread_addr_offset,
1563 src_wave_addr_offset,
1564 static_cast<index_t>(coherence)));
1565 }
1566 else if constexpr(N == 4)
1567 {
1568 return bit_cast<rtn_type>(
1569 llvm_amdgcn_raw_buffer_load_i16x4(src_wave_buffer_resource,
1570 src_thread_addr_offset,
1571 src_wave_addr_offset,
1572 static_cast<index_t>(coherence)));
1573 }
1574 else
1575 {
1576 // N >= 8: build from fp32x4 chunks
1577 thread_buffer<float, N / 2> tmp;
1578
1579 static_for<0, (N / 8), 1>{}([&](auto i) {
1580 constexpr index_t chunk = i;
1581 tmp.template get_as<fp32x4_t>()(i) = llvm_amdgcn_raw_buffer_load_fp32x4(
1582 src_wave_buffer_resource,
1583 src_thread_addr_offset,
1584 src_wave_addr_offset + (chunk * 4) * sizeof(float),
1585 static_cast<index_t>(coherence));
1586 });
1587 return bit_cast<rtn_type>(tmp);
1588 }
1589 }
1590 else // other datatype
1591 {
1593 src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset);
1594
1595 return bit_cast<rtn_type>(raw_data);
1596 }
1597}
1598
1599template <typename T,
1600 index_t N,
1602 bool oob_conditional_check = true,
1603 bool pre_nop = false>
1605 int32x4_t src_wave_buffer_resource,
1606 index_t src_thread_addr_offset,
1607 index_t src_wave_addr_offset,
1608 index_t src_linear_addr_offset,
1609 index_t flag = 0,
1611{
1612 constexpr index_t bytes = sizeof(T) * N;
1613 static_assert(bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8 || bytes == 16,
1614 "wrong! not supported by buffer_load instruction");
1615
1616 using type = thread_buffer<T, N>;
1617 if constexpr(oob_conditional_check)
1618 {
1619 buffer_load_if<sizeof(type), pre_nop>{}(dst,
1620 src_wave_buffer_resource,
1621 src_thread_addr_offset,
1622 src_wave_addr_offset,
1623 src_linear_addr_offset,
1624 flag,
1626 }
1627 else
1628 {
1629 buffer_load<sizeof(type), pre_nop>{}(dst,
1630 src_wave_buffer_resource,
1631 src_thread_addr_offset,
1632 src_wave_addr_offset,
1633 src_linear_addr_offset,
1634 flag,
1636 }
1637}
1638
1639template <typename T,
1640 index_t N,
1642 bool pre_nop = false>
1644 int32x4_t src_wave_buffer_resource,
1645 index_t src_thread_addr_offset,
1646 index_t src_wave_addr_offset,
1647 index_t src_immediate_addr_offset = 0,
1649{
1650 constexpr index_t num_bytes = sizeof(T) * N;
1651 constexpr index_t num_words = num_bytes / 4;
1652 static_assert(num_bytes % 4 == 0 && (num_words == 1 || num_words == 3 || num_words == 4),
1653 "wrong! only support in dword, dwordx3, dwordx4");
1654
1656 src_wave_buffer_resource,
1657 src_thread_addr_offset,
1658 src_wave_addr_offset,
1659 src_immediate_addr_offset,
1660 0,
1662}
1663
1664template <typename T,
1665 index_t N,
1667 bool oob_conditional_check = true>
1669 int32x4_t src_wave_buffer_resource,
1670 index_t src_thread_addr_offset,
1671 index_t src_wave_addr_offset,
1672 index_t src_immediate_addr_offset = 0,
1673 index_t flag = 0,
1675{
1676 constexpr index_t bytes = sizeof(T) * N;
1677
1678 // Used to catch the cases when src_immediate_addr_offset is NOT 0.
1679 // Remove this assert once other sizes are implemented.
1680 assert(src_immediate_addr_offset == 0 &&
1681 "wrong! not implemented src_immediate_addr_offset size, only 0 supported");
1682 ignore = src_immediate_addr_offset;
1683
1684#if defined(__gfx950__)
1685 static_assert(bytes == 4 || bytes == 12 || bytes == 16,
1686 "wrong! only support in dword, dwordx3, dwordx4");
1687 src_wave_addr_offset = 0;
1688#else
1689 static_assert(bytes == 4, "wrong! not implemented vector size");
1690#endif
1691
1692 // Set up v_offset:
1693 index_t v_offset = src_thread_addr_offset;
1694 if constexpr(oob_conditional_check)
1695 v_offset = flag ? v_offset : src_wave_buffer_resource[2];
1696
1697#pragma clang diagnostic push
1698#pragma clang diagnostic ignored "-Wold-style-cast"
1699 // Use C-style cast to change address space without dropping llvm noalias attribute
1700 llvm_amdgcn_raw_buffer_load_lds(src_wave_buffer_resource,
1701 (as3_uint32_ptr)(smem),
1702 bytes,
1703 v_offset,
1704 src_wave_addr_offset,
1705 /*src_immediate_addr_offset*/ 0,
1706 static_cast<index_t>(coherence));
1707#pragma clang diagnostic pop
1708}
1709
1710template <index_t N,
1713 int32x4_t dst_wave_buffer_resource,
1714 index_t dst_thread_addr_offset,
1715 index_t dst_wave_addr_offset)
1716{
1717 static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
1718 "wrong! not implemented");
1719
1720 if constexpr(N == 1)
1721 {
1723 dst_wave_buffer_resource,
1724 dst_thread_addr_offset,
1725 dst_wave_addr_offset,
1726 static_cast<index_t>(coherence));
1727 }
1728 else if constexpr(N == 2)
1729 {
1730
1732 dst_wave_buffer_resource,
1733 dst_thread_addr_offset,
1734 dst_wave_addr_offset,
1735 static_cast<index_t>(coherence));
1736 }
1737 else if constexpr(N == 4)
1738 {
1740 dst_wave_buffer_resource,
1741 dst_thread_addr_offset,
1742 dst_wave_addr_offset,
1743 static_cast<index_t>(coherence));
1744 }
1745 else if constexpr(N == 8)
1746 {
1748 dst_wave_buffer_resource,
1749 dst_thread_addr_offset,
1750 dst_wave_addr_offset,
1751 static_cast<index_t>(coherence));
1752 }
1753 else if constexpr(N == 16)
1754 {
1756 dst_wave_buffer_resource,
1757 dst_thread_addr_offset,
1758 dst_wave_addr_offset,
1759 static_cast<index_t>(coherence));
1760 }
1761 else if constexpr(N == 32)
1762 {
1764 src_thread_data.template get_as<int32x4_t>()[number<0>{}],
1765 dst_wave_buffer_resource,
1766 dst_thread_addr_offset,
1767 dst_wave_addr_offset,
1768 static_cast<index_t>(coherence));
1769
1771 src_thread_data.template get_as<int32x4_t>()[number<1>{}],
1772 dst_wave_buffer_resource,
1773 dst_thread_addr_offset,
1774 dst_wave_addr_offset + sizeof(int32_t) * 4,
1775 static_cast<index_t>(coherence));
1776 }
1777 else if constexpr(N == 64)
1778 {
1780 src_thread_data.template get_as<int32x4_t>()[number<0>{}],
1781 dst_wave_buffer_resource,
1782 dst_thread_addr_offset,
1783 dst_wave_addr_offset,
1784 static_cast<index_t>(coherence));
1785
1787 src_thread_data.template get_as<int32x4_t>()[number<1>{}],
1788 dst_wave_buffer_resource,
1789 dst_thread_addr_offset,
1790 dst_wave_addr_offset + sizeof(int32_t) * 4,
1791 static_cast<index_t>(coherence));
1792
1794 src_thread_data.template get_as<int32x4_t>()[number<2>{}],
1795 dst_wave_buffer_resource,
1796 dst_thread_addr_offset,
1797 dst_wave_addr_offset + sizeof(int32_t) * 8,
1798 static_cast<index_t>(coherence));
1799
1801 src_thread_data.template get_as<int32x4_t>()[number<3>{}],
1802 dst_wave_buffer_resource,
1803 dst_thread_addr_offset,
1804 dst_wave_addr_offset + sizeof(int32_t) * 12,
1805 static_cast<index_t>(coherence));
1806 }
1807}
1808
1809template <typename T,
1810 index_t N,
1813 int32x4_t dst_wave_buffer_resource,
1814 index_t dst_thread_addr_offset,
1815 index_t dst_wave_addr_offset)
1816{
1817 static_assert(
1818 (std::is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
1819 (std::is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1820 (std::is_same<T, fp16_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1821 (std::is_same<T, bf16_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1822 (std::is_same<T, int32_t>::value &&
1823 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1824 (std::is_same<T, fp8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1825 (std::is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1826 (std::is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1827 (std::is_same<T, uint16_t>::value &&
1828 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1829 (std::is_same<T, uint8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
1830 "wrong! not implemented");
1831
1832 if constexpr(std::is_same<T, float>::value) // fp32
1833 {
1834 if constexpr(N == 1)
1835 {
1837 dst_wave_buffer_resource,
1838 dst_thread_addr_offset,
1839 dst_wave_addr_offset,
1840 static_cast<index_t>(coherence));
1841 }
1842 else if constexpr(N == 2)
1843 {
1845 dst_wave_buffer_resource,
1846 dst_thread_addr_offset,
1847 dst_wave_addr_offset,
1848 static_cast<index_t>(coherence));
1849 }
1850 else if constexpr(N == 4)
1851 {
1853 dst_wave_buffer_resource,
1854 dst_thread_addr_offset,
1855 dst_wave_addr_offset,
1856 static_cast<index_t>(coherence));
1857 }
1858 else if constexpr(N == 8)
1859 {
1861 src_thread_data.template get_as<fp32x4_t>()[number<0>{}],
1862 dst_wave_buffer_resource,
1863 dst_thread_addr_offset,
1864 dst_wave_addr_offset,
1865 static_cast<index_t>(coherence));
1867 src_thread_data.template get_as<fp32x4_t>()[number<1>{}],
1868 dst_wave_buffer_resource,
1869 dst_thread_addr_offset,
1870 dst_wave_addr_offset + 4 * sizeof(float),
1871 static_cast<index_t>(coherence));
1872 }
1873 }
1874 else if constexpr(std::is_same<T, fp16_t>::value) // fp16
1875 {
1876 if constexpr(N == 1)
1877 {
1879 dst_wave_buffer_resource,
1880 dst_thread_addr_offset,
1881 dst_wave_addr_offset,
1882 static_cast<index_t>(coherence));
1883 }
1884 else if constexpr(N == 2)
1885 {
1887 dst_wave_buffer_resource,
1888 dst_thread_addr_offset,
1889 dst_wave_addr_offset,
1890 static_cast<index_t>(coherence));
1891 }
1892 else if constexpr(N == 4)
1893 {
1895 dst_wave_buffer_resource,
1896 dst_thread_addr_offset,
1897 dst_wave_addr_offset,
1898 static_cast<index_t>(coherence));
1899 }
1900 else if constexpr(N == 8)
1901 {
1902#if 0
1903 thread_buffer<fp16_t, 8> tmp{src_thread_data};
1904
1905 llvm_amdgcn_raw_buffer_store_fp16x4(tmp.template get_as<fp16x4_t>()[number<0>{}],
1906 dst_wave_buffer_resource,
1907 dst_thread_addr_offset,
1908 dst_wave_addr_offset,
1909 static_cast<index_t>(coherence));
1910
1911 llvm_amdgcn_raw_buffer_store_fp16x4(tmp.template get_as<fp16x4_t>()[number<1>{}],
1912 dst_wave_buffer_resource,
1913 dst_thread_addr_offset,
1914 dst_wave_addr_offset + 4 * sizeof(fp16_t),
1915 static_cast<index_t>(coherence));
1916#else
1918 dst_wave_buffer_resource,
1919 dst_thread_addr_offset,
1920 dst_wave_addr_offset,
1921 static_cast<index_t>(coherence));
1922#endif
1923 }
1924 }
1925 else if constexpr(std::is_same<T, bf16_t>::value) // bf16
1926 {
1927 if constexpr(N == 1)
1928 {
1930 dst_wave_buffer_resource,
1931 dst_thread_addr_offset,
1932 dst_wave_addr_offset,
1933 static_cast<index_t>(coherence));
1934 }
1935 else if constexpr(N == 2)
1936 {
1938 dst_wave_buffer_resource,
1939 dst_thread_addr_offset,
1940 dst_wave_addr_offset,
1941 static_cast<index_t>(coherence));
1942 }
1943 else if constexpr(N == 4)
1944 {
1946 dst_wave_buffer_resource,
1947 dst_thread_addr_offset,
1948 dst_wave_addr_offset,
1949 static_cast<index_t>(coherence));
1950 }
1951 else if constexpr(N == 8)
1952 {
1954 src_thread_data.template get_as<int16x4_t>()[number<0>{}],
1955 dst_wave_buffer_resource,
1956 dst_thread_addr_offset,
1957 dst_wave_addr_offset,
1958 static_cast<index_t>(coherence));
1959
1961 src_thread_data.template get_as<int16x4_t>()[number<1>{}],
1962 dst_wave_buffer_resource,
1963 dst_thread_addr_offset,
1964 dst_wave_addr_offset + 4 * sizeof(bf16_t),
1965 static_cast<index_t>(coherence));
1966 }
1967 }
1968 else if constexpr(std::is_same<T, uint16_t>::value)
1969 {
1970 if constexpr(N == 1)
1971 {
1973 dst_wave_buffer_resource,
1974 dst_thread_addr_offset,
1975 dst_wave_addr_offset,
1976 static_cast<index_t>(coherence));
1977 }
1978 else if constexpr(N == 2)
1979 {
1981 dst_wave_buffer_resource,
1982 dst_thread_addr_offset,
1983 dst_wave_addr_offset,
1984 static_cast<index_t>(coherence));
1985 }
1986 else if constexpr(N == 4)
1987 {
1989 dst_wave_buffer_resource,
1990 dst_thread_addr_offset,
1991 dst_wave_addr_offset,
1992 static_cast<index_t>(coherence));
1993 }
1994 else if constexpr(N == 8)
1995 {
1997 src_thread_data.template get_as<uint16x4_t>()[number<0>{}],
1998 dst_wave_buffer_resource,
1999 dst_thread_addr_offset,
2000 dst_wave_addr_offset,
2001 static_cast<index_t>(coherence));
2002
2004 src_thread_data.template get_as<uint16x4_t>()[number<1>{}],
2005 dst_wave_buffer_resource,
2006 dst_thread_addr_offset,
2007 dst_wave_addr_offset + 4 * sizeof(uint16_t),
2008 static_cast<index_t>(coherence));
2009 }
2010 }
2011 else
2012 {
2013 using r_t = thread_buffer<int8_t, sizeof(T) * N>;
2014
2016 dst_wave_buffer_resource,
2017 dst_thread_addr_offset,
2018 dst_wave_addr_offset);
2019 }
2020}
2021
2022template <typename T,
2023 index_t N,
2025 bool oob_conditional_check = true>
2027 int32x4_t dst_wave_buffer_resource,
2028 index_t dst_thread_addr_offset,
2029 index_t dst_wave_addr_offset,
2030 index_t dst_linear_addr_offset,
2031 index_t is_valid_element = 1)
2032{
2033 constexpr index_t bytes = sizeof(T) * N;
2034 static_assert(bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8 || bytes == 16,
2035 "wrong! not supported by buffer_store instruction");
2036
2037 using type = thread_buffer<T, N>;
2038 if constexpr(oob_conditional_check)
2039 {
2040 buffer_store_if<sizeof(type)>{}(dst_thread_data,
2041 dst_wave_buffer_resource,
2042 dst_thread_addr_offset,
2043 dst_wave_addr_offset,
2044 dst_linear_addr_offset,
2045 is_valid_element);
2046 }
2047 else
2048 {
2049 buffer_store<sizeof(type)>{}(dst_thread_data,
2050 dst_wave_buffer_resource,
2051 dst_thread_addr_offset,
2052 dst_wave_addr_offset,
2053 dst_linear_addr_offset);
2054 }
2055}
2056
2057template <typename T, index_t N>
2059 int32x4_t dst_wave_buffer_resource,
2060 index_t dst_thread_addr_offset,
2061 index_t dst_wave_addr_offset)
2062{
2063 static_assert((std::is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
2064 (std::is_same<T, fp16_t>::value && (N == 2 || N == 4 || N == 8)) ||
2065 (std::is_same<T, bf16_t>::value && (N == 2 || N == 4 || N == 8)) ||
2066 (std::is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)),
2067 "wrong! not implemented");
2068
2069 if constexpr(std::is_same<T, float>::value)
2070 {
2071 if constexpr(N == 1)
2072 {
2074 dst_wave_buffer_resource,
2075 dst_thread_addr_offset,
2076 dst_wave_addr_offset,
2077 0);
2078 }
2079 else if constexpr(N == 2)
2080 {
2082 src_thread_data.template get_as<float>()[number<0>{}],
2083 dst_wave_buffer_resource,
2084 dst_thread_addr_offset,
2085 dst_wave_addr_offset,
2086 0);
2087
2089 src_thread_data.template get_as<float>()[number<1>{}],
2090 dst_wave_buffer_resource,
2091 dst_thread_addr_offset,
2092 dst_wave_addr_offset + sizeof(float),
2093 0);
2094 }
2095 else if constexpr(N == 4)
2096 {
2098 src_thread_data.template get_as<float>()[number<0>{}],
2099 dst_wave_buffer_resource,
2100 dst_thread_addr_offset,
2101 dst_wave_addr_offset,
2102 0);
2103
2105 src_thread_data.template get_as<float>()[number<1>{}],
2106 dst_wave_buffer_resource,
2107 dst_thread_addr_offset,
2108 dst_wave_addr_offset + sizeof(float),
2109 0);
2110
2112 src_thread_data.template get_as<float>()[number<2>{}],
2113 dst_wave_buffer_resource,
2114 dst_thread_addr_offset,
2115 dst_wave_addr_offset + 2 * sizeof(float),
2116 0);
2117
2119 src_thread_data.template get_as<float>()[number<3>{}],
2120 dst_wave_buffer_resource,
2121 dst_thread_addr_offset,
2122 dst_wave_addr_offset + 3 * sizeof(float),
2123 0);
2124 }
2125 }
2126 else if constexpr(std::is_same<T, fp16_t>::value)
2127 {
2128 if constexpr(N == 2)
2129 {
2131 dst_wave_buffer_resource,
2132 dst_thread_addr_offset,
2133 dst_wave_addr_offset,
2134 0);
2135 }
2136 else if constexpr(N == 4)
2137 {
2138 static_for<0, 2, 1>{}([&](auto i) {
2140 src_thread_data.template get_as<fp16x2_t>()[i],
2141 dst_wave_buffer_resource,
2142 dst_thread_addr_offset,
2143 dst_wave_addr_offset + i * sizeof(fp16x2_t),
2144 0);
2145 });
2146 }
2147 else if constexpr(N == 8)
2148 {
2149 static_for<0, 4, 1>{}([&](auto i) {
2151 src_thread_data.template get_as<fp16x2_t>()[i],
2152 dst_wave_buffer_resource,
2153 dst_thread_addr_offset,
2154 dst_wave_addr_offset + i * sizeof(fp16x2_t),
2155 0);
2156 });
2157 }
2158 }
2159 else if constexpr(std::is_same<T, bf16_t>::value)
2160 {
2161 if constexpr(N == 2)
2162 {
2164 dst_wave_buffer_resource,
2165 dst_thread_addr_offset,
2166 dst_wave_addr_offset,
2167 0);
2168 }
2169 else if constexpr(N == 4)
2170 {
2171 static_for<0, 2, 1>{}([&](auto i) {
2173 src_thread_data.template get_as<bf16x2_t>()[i],
2174 dst_wave_buffer_resource,
2175 dst_thread_addr_offset,
2176 dst_wave_addr_offset + i * sizeof(bf16x2_t),
2177 0);
2178 });
2179 }
2180 else if constexpr(N == 8)
2181 {
2182 static_for<0, 4, 1>{}([&](auto i) {
2184 src_thread_data.template get_as<bf16x2_t>()[i],
2185 dst_wave_buffer_resource,
2186 dst_thread_addr_offset,
2187 dst_wave_addr_offset + i * sizeof(bf16x2_t),
2188 0);
2189 });
2190 }
2191 }
2192 else if constexpr(std::is_same<T, int32_t>::value)
2193 {
2194 if constexpr(N == 1)
2195 {
2197 dst_wave_buffer_resource,
2198 dst_thread_addr_offset,
2199 dst_wave_addr_offset,
2200 0);
2201 }
2202 else if constexpr(N == 2)
2203 {
2205 src_thread_data.template get_as<int32_t>()[number<0>{}],
2206 dst_wave_buffer_resource,
2207 dst_thread_addr_offset,
2208 dst_wave_addr_offset,
2209 0);
2210
2212 src_thread_data.template get_as<int32_t>()[number<1>{}],
2213 dst_wave_buffer_resource,
2214 dst_thread_addr_offset,
2215 dst_wave_addr_offset + sizeof(int32_t),
2216 0);
2217 }
2218 else if constexpr(N == 4)
2219 {
2221 src_thread_data.template get_as<int32_t>()[number<0>{}],
2222 dst_wave_buffer_resource,
2223 dst_thread_addr_offset,
2224 dst_wave_addr_offset,
2225 0);
2226
2228 src_thread_data.template get_as<int32_t>()[number<1>{}],
2229 dst_wave_buffer_resource,
2230 dst_thread_addr_offset,
2231 dst_wave_addr_offset + sizeof(int32_t),
2232 0);
2233
2235 src_thread_data.template get_as<int32_t>()[number<2>{}],
2236 dst_wave_buffer_resource,
2237 dst_thread_addr_offset,
2238 dst_wave_addr_offset + 2 * sizeof(int32_t),
2239 0);
2240
2242 src_thread_data.template get_as<int32_t>()[number<3>{}],
2243 dst_wave_buffer_resource,
2244 dst_thread_addr_offset,
2245 dst_wave_addr_offset + 3 * sizeof(int32_t),
2246 0);
2247 }
2248 }
2249}
2250
2251template <typename T, index_t N>
2253 int32x4_t dst_wave_buffer_resource,
2254 index_t dst_thread_addr_offset,
2255 index_t dst_wave_addr_offset)
2256{
2257 static_assert((std::is_same<T, double>::value && (N == 1 || N == 2 || N == 4)),
2258 "wrong! not implemented");
2259 if constexpr(std::is_same<T, double>::value)
2260 {
2261 if constexpr(N == 1)
2262 {
2264 dst_wave_buffer_resource,
2265 dst_thread_addr_offset,
2266 dst_wave_addr_offset,
2267 0);
2268 }
2269 else if constexpr(N == 2)
2270 {
2272 src_thread_data.template get_as<double>()[number<0>{}],
2273 dst_wave_buffer_resource,
2274 dst_thread_addr_offset,
2275 dst_wave_addr_offset,
2276 0);
2277
2279 src_thread_data.template get_as<double>()[number<1>{}],
2280 dst_wave_buffer_resource,
2281 dst_thread_addr_offset,
2282 dst_wave_addr_offset + sizeof(double),
2283 0);
2284 }
2285 else if constexpr(N == 4)
2286 {
2288 src_thread_data.template get_as<double>()[number<0>{}],
2289 dst_wave_buffer_resource,
2290 dst_thread_addr_offset,
2291 dst_wave_addr_offset,
2292 0);
2293
2295 src_thread_data.template get_as<double>()[number<1>{}],
2296 dst_wave_buffer_resource,
2297 dst_thread_addr_offset,
2298 dst_wave_addr_offset + sizeof(double),
2299 0);
2300
2302 src_thread_data.template get_as<double>()[number<2>{}],
2303 dst_wave_buffer_resource,
2304 dst_thread_addr_offset,
2305 dst_wave_addr_offset + 2 * sizeof(double),
2306 0);
2307
2309 src_thread_data.template get_as<double>()[number<3>{}],
2310 dst_wave_buffer_resource,
2311 dst_thread_addr_offset,
2312 dst_wave_addr_offset + 3 * sizeof(double),
2313 0);
2314 }
2315 }
2316}
2317
2318// buffer_load requires:
2319// 1) p_src_wave must point to global memory space
2320// 2) p_src_wave must be a wavewise pointer.
2321// It is user's responsibility to make sure that is true.
2322// oob_conditional_check : dynamic check if out-of-bound
2323template <typename T,
2324 index_t N,
2326 bool oob_conditional_check = true>
2329 index_t src_thread_element_offset,
2330 bool src_thread_element_valid,
2331 index_t src_element_space_size)
2332{
2333 const int32x4_t src_wave_buffer_resource =
2334 make_wave_buffer_resource(p_src_wave, src_element_space_size * sizeof(T));
2335
2336 index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
2337
2338#if CK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
2339 uint32_t src_addr_shift = [&]() {
2340 if constexpr(oob_conditional_check)
2341 return src_thread_element_valid ? 0 : 0x80000000;
2342 else
2343 return 0;
2344 }();
2346 src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
2347#else
2349 amd_buffer_load_impl<T, N, coherence>(src_wave_buffer_resource, src_thread_addr_offset, 0);
2350 if constexpr(oob_conditional_check)
2351 return src_thread_element_valid ? tmp : thread_buffer<T, N>{numeric<T>::zero()};
2352 else
2353 return tmp;
2354#endif
2355}
2356
2357// buffer_load requires:
2358// 1) p_src_wave must point to global memory space
2359// 2) p_src_wave must be a wavewise pointer.
2360// It is user's responsibility to make sure that is true.
2361template <typename T,
2362 index_t N,
2364 bool oob_conditional_check = true>
2367 index_t src_thread_element_offset,
2368 bool src_thread_element_valid,
2369 index_t src_element_space_size,
2370 T customized_value)
2371{
2372 const int32x4_t src_wave_buffer_resource =
2373 make_wave_buffer_resource(p_src_wave, src_element_space_size * sizeof(T));
2374
2375 index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
2376
2378 amd_buffer_load_impl<T, N, coherence>(src_wave_buffer_resource, src_thread_addr_offset, 0);
2379
2380 if constexpr(oob_conditional_check)
2381 return src_thread_element_valid ? tmp : thread_buffer<T, N>{customized_value};
2382 else
2383 return tmp;
2384}
2385
2386template <typename T,
2387 index_t N,
2389 bool oob_conditional_check = true,
2390 bool pre_nop = false>
2392 const T* p_src_wave,
2393 index_t src_thread_element_offset,
2394 index_t src_linear_element_offset,
2395 index_t src_element_space_size,
2396 index_t is_valid_element = 0,
2398{
2399 const int32x4_t src_wave_buffer_resource =
2400 make_wave_buffer_resource(p_src_wave, src_element_space_size * sizeof(T));
2401
2402 index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
2403 index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T);
2404
2406 dst,
2407 src_wave_buffer_resource,
2408 src_thread_addr_offset,
2409 0,
2410 src_linear_addr_offset,
2411 is_valid_element,
2413}
2414
2415// This version support buffer resource as input arg
2416template <typename T,
2417 index_t N,
2419 bool oob_conditional_check = true,
2420 bool pre_nop = false>
2422 const int32x4_t src_wave_buffer_resource,
2423 index_t src_thread_element_offset,
2424 index_t src_linear_element_offset,
2425 index_t is_valid_element = 0,
2427{
2428 index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
2429 index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T);
2430
2432 dst,
2433 src_wave_buffer_resource,
2434 src_thread_addr_offset,
2435 0,
2436 src_linear_addr_offset,
2437 is_valid_element,
2439}
2440
2441// unfortunately async copy can not make sure invalid data is zero inside LDS
2442// ... unless people manually write zero to LDS at the proper address.
2443// so not support invalid_element check for now.
2444// buffer_load OOB still working.
2445template <typename T,
2446 index_t N,
2448 bool pre_nop = false>
2450 const T* p_src_wave,
2451 index_t src_thread_element_offset,
2452 index_t src_linear_element_offset,
2453 index_t src_element_space_size,
2455{
2456 const int32x4_t src_wave_buffer_resource =
2457 make_wave_buffer_resource(p_src_wave, src_element_space_size * sizeof(T));
2458
2459 index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
2460 index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T);
2461
2463 src_wave_buffer_resource,
2464 src_thread_addr_offset,
2465 0,
2466 src_linear_addr_offset,
2468}
2469
2470// This version support buffer resource as input arg
2471template <typename T,
2472 index_t N,
2474 bool pre_nop = false>
2476 const int32x4_t src_wave_buffer_resource,
2477 index_t src_thread_element_offset,
2478 index_t src_linear_element_offset,
2480{
2481 index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
2482 index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T);
2483
2485 src_wave_buffer_resource,
2486 src_thread_addr_offset,
2487 0,
2488 src_linear_addr_offset,
2490}
2491
2492// This version support buffer resource as input arg
2493template <typename T,
2494 index_t N,
2496 bool oob_conditional_check = false>
2498 const int32x4_t src_wave_buffer_resource,
2499 index_t src_thread_element_offset,
2500 index_t src_linear_element_offset,
2501 bool is_valid_element,
2503{
2504 index_t src_thread_addr_offset = src_thread_element_offset * sizeof(T);
2505 index_t src_linear_addr_offset = src_linear_element_offset * sizeof(T);
2506
2508 src_wave_buffer_resource,
2509 src_thread_addr_offset,
2510 0,
2511 src_linear_addr_offset,
2512 is_valid_element,
2514}
2515
2516// buffer_store requires:
2517// 1) p_dst_wave must point to global memory
2518// 2) p_dst_wave must be a wavewise pointer.
2519// It is user's responsibility to make sure that is true.
2520template <typename T,
2521 index_t N,
2523 bool oob_conditional_check = true>
2524CK_TILE_DEVICE void amd_buffer_store(const thread_buffer<T, N>& src_thread_data,
2525 T* p_dst_wave,
2526 const index_t dst_thread_element_offset,
2527 const bool dst_thread_element_valid,
2528 const index_t dst_element_space_size)
2529{
2530 const int32x4_t dst_wave_buffer_resource =
2531 make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T));
2532
2533 index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
2534
2535#if CK_TILE_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
2536 uint32_t dst_addr_shift = [&]() {
2537 if constexpr(oob_conditional_check)
2538 return dst_thread_element_valid ? 0 : 0x80000000;
2539 else
2540 return 0;
2541 }();
2543 src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
2544#else
2545 if constexpr(oob_conditional_check)
2546 {
2547 if(dst_thread_element_valid)
2548 {
2550 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
2551 }
2552 }
2553 else
2554 {
2556 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
2557 }
2558#endif
2559}
2560
2561template <typename T,
2562 index_t N,
2564 bool oob_conditional_check = true>
2565CK_TILE_DEVICE void amd_buffer_store_raw(const thread_buffer<T, N>& src_thread_data,
2566 T* p_dst_wave,
2567 const index_t dst_thread_element_offset,
2568 const index_t dst_linear_element_offset,
2569 const bool dst_thread_element_valid,
2570 const index_t dst_element_space_size)
2571{
2572 const int32x4_t dst_wave_buffer_resource =
2573 make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T));
2574
2575 index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
2576 index_t dst_linear_addr_offset = dst_linear_element_offset * sizeof(T);
2577
2579 dst_wave_buffer_resource,
2580 dst_thread_addr_offset,
2581 0,
2582 dst_linear_addr_offset,
2583 dst_thread_element_valid);
2584}
2585
2586// buffer_atomic_add requires:
2587// 1) p_dst_wave must point to global memory
2588// 2) p_dst_wave must be a wavewise pointer.
2589// It is user's responsibility to make sure that is true.
2590template <typename T, index_t N>
2591CK_TILE_DEVICE void amd_buffer_atomic_add(const thread_buffer<T, N>& src_thread_data,
2592 T* p_dst_wave,
2593 const index_t dst_thread_element_offset,
2594 const bool dst_thread_element_valid,
2595 const index_t dst_element_space_size)
2596{
2597 const int32x4_t dst_wave_buffer_resource =
2598 make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T));
2599
2600 index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
2601
2602#if CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK
2603 uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
2604
2606 src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
2607#else
2608 if(dst_thread_element_valid)
2609 {
2611 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
2612 }
2613#endif
2614}
2615
2616template <typename T,
2617 index_t N,
2619 bool oob_conditional_check = true,
2620 bool pre_nop = false>
2622 T* p_dst_wave,
2623 const index_t dst_thread_element_offset,
2624 const index_t dst_linear_element_offset,
2625 const bool dst_thread_element_valid,
2626 const index_t dst_element_space_size,
2628{
2629 const int32x4_t dst_wave_buffer_resource =
2630 make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T));
2631
2632 index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
2633 index_t dst_linear_addr_offset = dst_linear_element_offset * sizeof(T);
2634
2635 if constexpr(oob_conditional_check)
2636 {
2637 buffer_atomic_add_if<T, N, pre_nop>{}(src_thread_data,
2638 dst_wave_buffer_resource,
2639 dst_thread_addr_offset,
2640 0,
2641 dst_linear_addr_offset,
2642 dst_thread_element_valid);
2643 }
2644 else
2645 {
2646 buffer_atomic_add<T, N, pre_nop>{}(src_thread_data,
2647 dst_wave_buffer_resource,
2648 dst_thread_addr_offset,
2649 0,
2650 dst_linear_addr_offset,
2651 1);
2652 }
2653}
2654
2655// buffer_atomic_max requires:
2656// 1) p_dst_wave must point to global memory
2657// 2) p_dst_wave must be a wavewise pointer.
2658// It is user's responsibility to make sure that is true.
2659template <typename T, index_t N>
2660CK_TILE_DEVICE void amd_buffer_atomic_max(const thread_buffer<T, N>& src_thread_data,
2661 T* p_dst_wave,
2662 const index_t dst_thread_element_offset,
2663 const bool dst_thread_element_valid,
2664 const index_t dst_element_space_size)
2665{
2666 const int32x4_t dst_wave_buffer_resource =
2667 make_wave_buffer_resource(p_dst_wave, dst_element_space_size * sizeof(T));
2668
2669 index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
2670
2671#if CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK
2672 uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
2673
2675 src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
2676#else
2677 if(dst_thread_element_valid)
2678 {
2680 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
2681 }
2682#endif
2683}
2684
2685template <typename T, index_t NumElemsPerThread>
2686CK_TILE_DEVICE void amd_direct_load_global_to_lds(const T* global_base_ptr,
2687 const index_t global_offset,
2688 T* lds_base_ptr,
2689 const index_t lds_offset,
2690 const bool is_valid,
2691 const index_t src_element_space_size)
2692{
2693 const uint32_t* global_ptr =
2694 reinterpret_cast<uint32_t*>(reinterpret_cast<uintptr_t>(global_base_ptr));
2695 const int32x4_t src_resource =
2696 make_wave_buffer_resource(global_ptr, src_element_space_size * sizeof(T));
2697 const index_t global_offset_bytes = is_valid ? global_offset * sizeof(T) : 0x80000000;
2698
2699#if CK_TILE_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
2700 T* lds_ptr = lds_base_ptr + lds_offset;
2701 auto const lds_ptr_sgpr = amd_wave_read_first_lane((reinterpret_cast<uintptr_t>(lds_ptr)));
2702 asm volatile("s_mov_b32 m0, %0; \n\t"
2703 "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::"s"(lds_ptr_sgpr),
2704 "v"(global_offset_bytes),
2705 "s"(src_resource)
2706 : "memory");
2707#else
2708 // Direct loads require that each thread reads and writes exactly a single DWORD.
2709#if defined(__gfx9__)
2710 constexpr auto bytes_per_thread = sizeof(T) * NumElemsPerThread;
2711#endif
2712 // Direct loads require that each thread reads and writes a multiple of DWORDs (4 bytes).
2713 // For gfx950: supports 1, 3, or 4 DWORDs per thread
2714 // For gfx942: supports exactly 1 DWORD per thread
2715#if defined(__gfx950__)
2716 constexpr auto dword_bytes = 4;
2717 static_assert(bytes_per_thread == dword_bytes || bytes_per_thread == dword_bytes * 3 ||
2718 bytes_per_thread == dword_bytes * 4);
2719#elif defined(__gfx9__)
2720 constexpr auto dword_bytes = 4;
2721 static_assert(bytes_per_thread == dword_bytes);
2722#endif
2723 // LDS pointer must be attributed with the LDS address space.
2724 as3_uint32_ptr lds_ptr =
2725 reinterpret_cast<as3_uint32_ptr>(reinterpret_cast<uintptr_t>(lds_base_ptr + lds_offset));
2726
2728 src_resource, lds_ptr, bytes_per_thread, global_offset_bytes, 0, 0, 0);
2729#endif
2730}
2731
2732#if defined(__gfx950__)
2733template <typename T, index_t N>
2734__device__ auto amd_transpose_load_to_vgpr(const T* __restrict__ in_ptr)
2735{
2736#define __LDS_ADDR __attribute__((address_space(3)))
2737
2738 static_assert(__has_builtin(__builtin_amdgcn_raw_buffer_load_b32),
2739 "We need to have the compatible compiler version to build this instruction");
2740
2741#pragma clang diagnostic push
2742#pragma clang diagnostic ignored "-Wold-style-cast"
2743 // Use C-style cast to change address space without dropping llvm noalias attribute
2744 const auto in_ptr_ = (__LDS_ADDR T*)(const_cast<T*>(in_ptr));
2745#pragma clang diagnostic pop
2746 if constexpr(std::is_same_v<remove_cvref_t<T>, ck_tile::half_t>)
2747 {
2748 typedef __attribute__((__vector_size__(4 * sizeof(__fp16)))) __fp16 llvm_fp16x4_t;
2749 auto lds_ptr = reinterpret_cast<__LDS_ADDR llvm_fp16x4_t*>(in_ptr_);
2750 return bit_cast<thread_buffer<T, N>>(__builtin_amdgcn_ds_read_tr16_b64_v4f16(lds_ptr));
2751 }
2752 else if constexpr(std::is_same_v<remove_cvref_t<T>, ck_tile::bf16_t>)
2753 {
2754 typedef __attribute__((__vector_size__(4 * sizeof(__bf16)))) __bf16 llvm_bf16x4_t;
2755 auto lds_ptr = reinterpret_cast<__LDS_ADDR llvm_bf16x4_t*>(in_ptr_);
2756 return bit_cast<thread_buffer<T, N>>(__builtin_amdgcn_ds_read_tr16_b64_v4bf16(lds_ptr));
2757 }
2758 else if constexpr(std::is_same_v<remove_cvref_t<T>, ck_tile::fp8_t> ||
2759 std::is_same_v<remove_cvref_t<T>, ck_tile::bf8_t> ||
2760 std::is_same_v<remove_cvref_t<T>, ck_tile::int8_t>)
2761 {
2762 typedef __attribute__((__vector_size__(2 * sizeof(index_t)))) index_t llvm_i32x2_t;
2763 auto lds_ptr = reinterpret_cast<__LDS_ADDR llvm_i32x2_t*>(in_ptr_);
2764 return bit_cast<thread_buffer<T, N>>(__builtin_amdgcn_ds_read_tr8_b64_v2i32(lds_ptr));
2765 }
2766 else
2767 {
2768 static_assert(false, "not implemented");
2769 }
2770#undef __LDS_ADDR
2771}
2772#endif
2773
2774} // namespace ck_tile
2775
2776#endif // CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN
#define CK_TILE_DEVICE
Definition config.hpp:41
#define CK_TILE_BUFFER_RESOURCE_3RD_DWORD
Definition config.hpp:210
#define CK_TILE_DEVICE_EXTERN
Definition config.hpp:43
#define CK_TILE_LDS_ADDR
Definition config.hpp:58
Definition tile/core/arch/amd_buffer_addressing.hpp:110
CK_TILE_DEVICE void insert_dummy_dep_per_dword< 3 >(array< float, 3 > &b)
Definition tile/core/arch/amd_buffer_addressing.hpp:997
CK_TILE_DEVICE void insert_dummy_dep()
Definition tile/core/arch/amd_buffer_addressing.hpp:1037
CK_TILE_DEVICE void insert_dummy_dep_per_dword< 32 >(array< float, 32 > &b)
Definition tile/core/arch/amd_buffer_addressing.hpp:1025
CK_TILE_DEVICE void insert_dummy_dep_per_dword< 8 >(array< float, 8 > &b)
Definition tile/core/arch/amd_buffer_addressing.hpp:1009
CK_TILE_DEVICE void insert_dummy_dep_per_dword(array< float, N > &b)
Definition tile/core/arch/amd_buffer_addressing.hpp:981
CK_TILE_DEVICE void insert_dummy_dep_per_dword< 2 >(array< float, 2 > &b)
Definition tile/core/arch/amd_buffer_addressing.hpp:991
CK_TILE_DEVICE void insert_dummy_dep_per_dword< 4 >(array< float, 4 > &b)
Definition tile/core/arch/amd_buffer_addressing.hpp:1003
CK_TILE_DEVICE void insert_dummy_dep_per_dword< 16 >(array< float, 16 > &b)
Definition tile/core/arch/amd_buffer_addressing.hpp:1016
Definition tile/core/algorithm/cluster_descriptor.hpp:13
CK_TILE_DEVICE_EXTERN int8x2_t llvm_amdgcn_raw_buffer_load_i8x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8")
_Float16 fp16x2_t
Definition half.hpp:385
CK_TILE_DEVICE thread_buffer< T, N > amd_buffer_load_impl(int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
Definition tile/core/arch/amd_buffer_addressing.hpp:1535
remove_cv_t< std::remove_reference_t< T > > remove_cvref_t
Definition type_traits.hpp:21
int8_t int8x2_t
Definition pk_int4.hpp:103
CK_TILE_DEVICE void amd_buffer_atomic_max(const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition tile/core/arch/amd_buffer_addressing.hpp:2874
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp16(_Float16 vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16")
__device__ uint32_t amd_wave_read_first_lane(uint16_t v)
Definition tile/core/arch/amd_buffer_addressing.hpp:35
_Float16 half_t
Definition half.hpp:111
CK_TILE_DEVICE_EXTERN fp16x4_t llvm_amdgcn_raw_buffer_load_fp16x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16")
uint16_t uint16x2_t
Definition vector_type.hpp:181
int16_t int16x4_t
Definition vector_type.hpp:173
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i8x4(int8x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8")
CK_TILE_DEVICE_EXTERN int8_t llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i8(int8_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8")
constant< b > bool_constant
Definition tile/core/numeric/integral_constant.hpp:43
int8_t int8_t
Definition int8.hpp:20
CK_TILE_DEVICE void amd_buffer_store(const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition tile/core/arch/amd_buffer_addressing.hpp:2738
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i16x2(int16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16")
bfloat16_t bf16_t
Definition bfloat16.hpp:113
CK_TILE_DEVICE auto async_load_fence_raw(index_t cnt=0)
Definition tile/core/arch/amd_buffer_addressing.hpp:1068
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i16x4(int16x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16")
_Float16 fp16_t
Definition half.hpp:110
amd_buffer_coherence_enum
Definition tile/core/arch/amd_buffer_addressing.hpp:1404
@ glc_slc
Definition tile/core/arch/amd_buffer_addressing.hpp:1408
@ SYSTEM_NT1
Definition tile/core/arch/amd_buffer_addressing.hpp:1419
@ coherence_default
Definition tile/core/arch/amd_buffer_addressing.hpp:1405
@ WAVE_NT0
Definition tile/core/arch/amd_buffer_addressing.hpp:1412
@ slc
Definition tile/core/arch/amd_buffer_addressing.hpp:1407
@ DEVICE_NT1
Definition tile/core/arch/amd_buffer_addressing.hpp:1417
@ SYSTEM_NT0
Definition tile/core/arch/amd_buffer_addressing.hpp:1418
@ glc
Definition tile/core/arch/amd_buffer_addressing.hpp:1406
@ GROUP_NT1
Definition tile/core/arch/amd_buffer_addressing.hpp:1415
@ DEVICE_NT0
Definition tile/core/arch/amd_buffer_addressing.hpp:1416
@ GROUP_NT0
Definition tile/core/arch/amd_buffer_addressing.hpp:1414
@ WAVE_NT1
Definition tile/core/arch/amd_buffer_addressing.hpp:1413
CK_TILE_DEVICE_EXTERN double llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata, int32x4_t rsrc, int voffset, int soffset, int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64")
CK_TILE_DEVICE_EXTERN float llvm_amdgcn_raw_buffer_atomic_add_fp32(float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32")
_BitInt(8) fp8_t
Definition float8.hpp:204
CK_TILE_DEVICE_EXTERN fp32x2_t llvm_amdgcn_raw_buffer_load_fp32x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32")
CK_TILE_DEVICE void amd_async_buffer_load_with_oob(CK_TILE_LDS_ADDR T *smem, const int32x4_t src_wave_buffer_resource, index_t src_thread_element_offset, index_t src_linear_element_offset, bool is_valid_element, bool_constant< oob_conditional_check >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:2711
tuple_array< T, N > thread_buffer
Definition thread_buffer.hpp:14
int32_t int32x4_t
Definition vector_type.hpp:155
CK_TILE_DEVICE void amd_async_buffer_load_with_oob_raw(T *smem, const T *p_src_wave, index_t src_thread_element_offset, index_t src_linear_element_offset, index_t src_element_space_size, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:2663
bfloat16_t bf16x2_t
Definition pk_fp4.hpp:24
CK_TILE_HOST_DEVICE constexpr Y bit_cast(const X &x)
Definition bit_cast.hpp:11
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp16x2(fp16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp32x4(fp32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp16x4(fp16x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16")
CK_TILE_DEVICE_EXTERN int32x4_t llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32")
CK_TILE_DEVICE void lds_load_fence(index_t cnt=0)
Definition tile/core/arch/amd_buffer_addressing.hpp:820
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i8x2(int8x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8")
CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer< int8_t, N > src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition tile/core/arch/amd_buffer_addressing.hpp:1926
CK_TILE_DEVICE_EXTERN bf16x2_t llvm_amdgcn_raw_buffer_atomic_add_bf16x2(bf16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16")
CK_TILE_DEVICE void async_buffer_load_dwordxn_v(void *smem, int32x4_t rsrc, index_t voffset, index_t, index_t ioffset, index_t=0, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:1352
_Float16 fp16x4_t
Definition vector_type.hpp:137
CK_TILE_DEVICE_EXTERN int32_t llvm_amdgcn_raw_buffer_load_i32(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32")
CK_TILE_DEVICE void amd_buffer_atomic_max_impl(const thread_buffer< T, N > src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition tile/core/arch/amd_buffer_addressing.hpp:2466
CK_TILE_DEVICE void amd_buffer_atomic_add_raw(const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const index_t dst_linear_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:2835
CK_TILE_DEVICE_EXTERN int32x2_t llvm_amdgcn_raw_buffer_load_i32x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32")
constant< v > number
Definition tile/core/numeric/integral_constant.hpp:37
CK_TILE_DEVICE void amd_buffer_atomic_add_impl(const thread_buffer< T, N > &src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition tile/core/arch/amd_buffer_addressing.hpp:2272
constexpr detail::ignore_t ignore
Definition tile/core/utility/ignore.hpp:20
CK_TILE_DEVICE void buffer_store_fence(index_t cnt=0)
Definition tile/core/arch/amd_buffer_addressing.hpp:1063
CK_TILE_DEVICE_EXTERN float llvm_amdgcn_raw_buffer_load_fp32(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32")
CK_TILE_DEVICE_EXTERN fp16x2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2(fp16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16")
bfloat16_t bf16x4_t
Definition vector_type.hpp:146
int32_t int32_t
Definition integer.hpp:10
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i32x4(int32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32")
CK_TILE_DEVICE void amd_buffer_load_raw(thread_buffer< T, N > &dst, const T *p_src_wave, index_t src_thread_element_offset, index_t src_linear_element_offset, index_t src_element_space_size, index_t is_valid_element=0, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:2605
bfloat16_t bf16x8_t
Definition vector_type.hpp:147
CK_TILE_DEVICE thread_buffer< T, N > amd_buffer_load_invalid_element_return_zero(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size)
Definition tile/core/arch/amd_buffer_addressing.hpp:2542
CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt=0)
Definition tile/core/arch/amd_buffer_addressing.hpp:1393
CK_TILE_DEVICE void amd_buffer_store_raw(const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const index_t dst_linear_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition tile/core/arch/amd_buffer_addressing.hpp:2779
CK_TILE_DEVICE_EXTERN fp32x4_t llvm_amdgcn_raw_buffer_load_fp32x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32")
CK_TILE_DEVICE void buffer_load_fence(index_t cnt=0)
Definition tile/core/arch/amd_buffer_addressing.hpp:815
typename impl::ext_vector< T, N >::type ext_vector_t
Definition vector_type.hpp:84
unsigned _BitInt(8) bf8_t
Definition float8.hpp:206
CK_TILE_DEVICE_EXTERN int16_t llvm_amdgcn_raw_buffer_load_i16(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16")
CK_TILE_DEVICE_EXTERN int16x4_t llvm_amdgcn_raw_buffer_load_i16x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16")
CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer< T, N > src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition tile/core/arch/amd_buffer_addressing.hpp:2026
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp32x2(fp32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32")
float fp32x4_t
Definition vector_type.hpp:128
CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void *ptr, uint32_t size=0xffffffff, ForceSGPR={})
Definition tile/core/arch/amd_buffer_addressing.hpp:97
CK_TILE_DEVICE void amd_buffer_atomic_add(const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition tile/core/arch/amd_buffer_addressing.hpp:2805
uint16_t uint16x4_t
Definition vector_type.hpp:182
float fp32x2_t
Definition pk_fp4.hpp:22
int8_t int8x4_t
Definition vector_type.hpp:191
CK_TILE_DEVICE_EXTERN int16x2_t llvm_amdgcn_raw_buffer_load_i16x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16")
CK_TILE_DEVICE void amd_async_buffer_load(CK_TILE_LDS_ADDR T *smem, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_immediate_addr_offset=0, index_t flag=0, bool_constant< oob_conditional_check >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:1882
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_ui16x2(uint16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16")
int32_t index_t
Definition integer.hpp:9
CK_TILE_DEVICE thread_buffer< T, N > amd_buffer_load_invalid_element_return_customized_value(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value)
Definition tile/core/arch/amd_buffer_addressing.hpp:2580
int32_t int32x2_t
Definition vector_type.hpp:154
CK_TILE_DEVICE_EXTERN fp16x2_t llvm_amdgcn_raw_buffer_load_fp16x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16")
CK_TILE_DEVICE_EXTERN _Float16 llvm_amdgcn_raw_buffer_load_fp16(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_ui16x4(uint16x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16")
CK_TILE_DEVICE thread_buffer< int8_t, N > amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
Definition tile/core/arch/amd_buffer_addressing.hpp:1425
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, as3_uint32_ptr lds_ptr, index_t size, index_t voffset, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds")
CK_TILE_DEVICE void amd_async_buffer_load_impl(CK_TILE_LDS_ADDR T *smem, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_immediate_addr_offset=0, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:1857
CK_TILE_DEVICE_EXTERN int32_t llvm_amdgcn_raw_buffer_atomic_add_i32(int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i16(int16_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16")
int16_t int16x2_t
Definition vector_type.hpp:172
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i32(int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32")
CK_TILE_DEVICE void amd_buffer_store_raw_impl(const thread_buffer< T, N > &dst_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset, index_t dst_linear_addr_offset, index_t is_valid_element=1)
Definition tile/core/arch/amd_buffer_addressing.hpp:2240
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i32x2(int32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp32(float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_ui16(uint16_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16")
CK_TILE_DEVICE_EXTERN int8x4_t llvm_amdgcn_raw_buffer_load_i8x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8")
CK_TILE_DEVICE void amd_buffer_load_raw_impl(thread_buffer< T, N > &dst, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_linear_addr_offset, index_t flag=0, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:1818
__device__ void amd_direct_load_global_to_lds(const T *global_base_ptr, const index_t global_offset, T *lds_base_ptr, const index_t lds_offset, const bool is_valid, const index_t src_element_space_size)
Definition utility/amd_buffer_addressing.hpp:1015
const GenericPointer< typename T::ValueType > T2 value
Definition pointer.h:1697
signed short int16_t
Definition stdint.h:122
unsigned short uint16_t
Definition stdint.h:125
_W64 unsigned int uintptr_t
Definition stdint.h:164
unsigned int uint32_t
Definition stdint.h:126
unsigned char uint8_t
Definition stdint.h:124
Definition tile/core/arch/amd_buffer_addressing.hpp:826
Definition tile/core/arch/amd_buffer_addressing.hpp:857
Definition tile/core/arch/amd_buffer_addressing.hpp:134
Definition tile/core/arch/amd_buffer_addressing.hpp:131
Definition tile/core/arch/amd_buffer_addressing.hpp:90
Definition tile/core/arch/amd_buffer_addressing.hpp:140
Definition tile/core/arch/amd_buffer_addressing.hpp:137
fp32x4_t payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:115
float payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:119
float payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:118
float payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:117
fp32x2_t payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:116
Definition tile/core/arch/amd_buffer_addressing.hpp:113
fp32x4_t payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:884
float payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:888
float payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:886
fp32x2_t payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:885
Definition tile/core/arch/amd_buffer_addressing.hpp:882
static CK_TILE_HOST_DEVICE constexpr T zero()
Definition tile/core/numeric/numeric.hpp:58
Definition coordinate_transform.hpp:1392
Definition tile/core/arch/amd_buffer_addressing.hpp:895
Definition tile/core/utility/functional.hpp:43
Definition tile/core/utility/debug.hpp:67
uint32_t * as3_uint32_ptr
Definition tile/core/arch/amd_buffer_addressing.hpp:29
#define CK_TILE_ASYNC_LOAD_WITH_INSTR(instr)