author     Artem Belevich <tra@google.com>   2019-04-25 22:28:09 +0000
committer  Artem Belevich <tra@google.com>   2019-04-25 22:28:09 +0000
commit     eee685e5e1a9e12245a2a20bfd44db99ca63f8ce (patch)
tree       e10b3a59b289cc91b1c9e1725fb39175f7721e05 /include
parent     5e1ea7eba87adbe43fed5746bcbba41d7cff1bf2 (diff)
[CUDA] Implemented _[bi]mma* builtins.
These builtins provide access to the new integer and
sub-integer variants of MMA (matrix multiply-accumulate) instructions
provided by CUDA-10.x on sm_75 (AKA Turing) GPUs.
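As a rough illustration of how the new builtins are driven from device code when compiling for sm_72 or newer (the builtin names and type signatures below come from the patch itself; the per-thread fragment sizes, the stride of 16, and the layout/satf encodings of 0 are assumptions based on the PTX WMMA conventions, not something this commit specifies):

// Sketch: one warp computes a 16x16 s32 tile, D = A(16x16 s8) x B(16x16 s8) + C(16x16 s32).
__device__ void imma_16x16x16_tile(int *d, const signed char *a,
                                   const signed char *b, const int *c) {
  // Assumed per-thread fragment sizes for the m16n16k16 s8 shape: 2/2/8 32-bit values.
  int frag_a[2], frag_b[2], frag_c[8], frag_d[8];
  __imma_m16n16k16_ld_a_s8(frag_a, reinterpret_cast<const int *>(a), 16, 0);
  __imma_m16n16k16_ld_b_s8(frag_b, reinterpret_cast<const int *>(b), 16, 0);
  __imma_m16n16k16_ld_c(frag_c, c, 16, 0);
  // The trailing (layout, satf) arguments must be integer constants; 0/0 assumed here.
  __imma_m16n16k16_mma_s8(frag_d, frag_a, frag_b, frag_c, 0, 0);
  __imma_m16n16k16_st_c_i32(d, frag_d, 16, 0);
}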
Also added a feature for PTX 6.4. While Clang/LLVM does not generate
any PTX instructions that need it, we still need to pass it through to
ptxas in order to be able to compile code that uses the new 'mma'
instruction as inline assembly (e.g. used by NVIDIA's CUTLASS library
https://github.com/NVIDIA/cutlass/blob/master/cutlass/arch/mma.h#L101)
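The inline assembly in question looks roughly like the following: a sketch of a Turing integer mma.sync written against the PTX ISA 6.4 operand shapes, not a copy of the CUTLASS code linked above, so the exact register counts here are assumptions.

__device__ void mma_s8_8x8x16(int &d0, int &d1, unsigned a, unsigned b,
                              int c0, int c1) {
  // m8n8k16 with s8 inputs and s32 accumulators: assumed one 32-bit register
  // for A and for B, two for C and for D, per thread.
  asm volatile("mma.sync.aligned.m8n8k16.row.col.s32.s8.s8.s32 "
               "{%0, %1}, {%2}, {%3}, {%4, %5};"
               : "=r"(d0), "=r"(d1)
               : "r"(a), "r"(b), "r"(c0), "r"(c1));
}

ptxas only accepts this instruction once it is handed PTX 6.4, which is why the ptx64 feature has to be passed through.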
Differential Revision: https://reviews.llvm.org/D60279
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@359248 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'include')
-rw-r--r--   include/clang/Basic/BuiltinsNVPTX.def   58
1 file changed, 55 insertions, 3 deletions
diff --git a/include/clang/Basic/BuiltinsNVPTX.def b/include/clang/Basic/BuiltinsNVPTX.def
index b15ed2666b..70be6182c7 100644
--- a/include/clang/Basic/BuiltinsNVPTX.def
+++ b/include/clang/Basic/BuiltinsNVPTX.def
@@ -18,13 +18,22 @@
 #endif
 
 #pragma push_macro("SM_70")
-#define SM_70 "sm_70|sm_71"
+#pragma push_macro("SM_72")
+#pragma push_macro("SM_75")
+#define SM_75 "sm_75"
+#define SM_72 "sm_72|" SM_75
+#define SM_70 "sm_70|" SM_72
+
 #pragma push_macro("SM_60")
 #define SM_60 "sm_60|sm_61|sm_62|" SM_70
 
-#pragma push_macro("PTX61")
-#define PTX61 "ptx61"
 #pragma push_macro("PTX60")
+#pragma push_macro("PTX61")
+#pragma push_macro("PTX63")
+#pragma push_macro("PTX64")
+#define PTX64 "ptx64"
+#define PTX63 "ptx63|" PTX64
+#define PTX61 "ptx61|" PTX63
 #define PTX60 "ptx60|" PTX61
 
 #pragma push_macro("AND")
@@ -666,10 +675,53 @@ TARGET_BUILTIN(__hmma_m8n32k16_mma_f32f16, "vf*iC*iC*iC*IiIi", "", AND(SM_70,PTX
 TARGET_BUILTIN(__hmma_m8n32k16_mma_f32f32, "vf*iC*iC*fC*IiIi", "", AND(SM_70,PTX61))
 TARGET_BUILTIN(__hmma_m8n32k16_mma_f16f32, "vi*iC*iC*fC*IiIi", "", AND(SM_70,PTX61))
 
+// Builtins to support integer and sub-integer WMMA instructions on sm_72/sm_75
+TARGET_BUILTIN(__bmma_m8n8k128_ld_a_b1, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__bmma_m8n8k128_ld_b_b1, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__bmma_m8n8k128_ld_c, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__bmma_m8n8k128_mma_xor_popc_b1, "vi*iC*iC*iC*Ii", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__bmma_m8n8k128_st_c_i32, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__imma_m16n16k16_ld_a_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m16n16k16_ld_a_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m16n16k16_ld_b_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m16n16k16_ld_b_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m16n16k16_ld_c, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m16n16k16_mma_s8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m16n16k16_mma_u8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m16n16k16_st_c_i32, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m32n8k16_ld_a_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m32n8k16_ld_a_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m32n8k16_ld_b_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m32n8k16_ld_b_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m32n8k16_ld_c, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m32n8k16_mma_s8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m32n8k16_mma_u8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m32n8k16_st_c_i32, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m8n32k16_ld_a_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m8n32k16_ld_a_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m8n32k16_ld_b_s8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m8n32k16_ld_b_u8, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m8n32k16_ld_c, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m8n32k16_mma_s8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m8n32k16_mma_u8, "vi*iC*iC*iC*IiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m8n32k16_st_c_i32, "vi*iC*UiIi", "", AND(SM_72,PTX63))
+TARGET_BUILTIN(__imma_m8n8k32_ld_a_s4, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__imma_m8n8k32_ld_a_u4, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__imma_m8n8k32_ld_b_s4, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__imma_m8n8k32_ld_b_u4, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__imma_m8n8k32_ld_c, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__imma_m8n8k32_mma_s4, "vi*iC*iC*iC*IiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__imma_m8n8k32_mma_u4, "vi*iC*iC*iC*IiIi", "", AND(SM_75,PTX63))
+TARGET_BUILTIN(__imma_m8n8k32_st_c_i32, "vi*iC*UiIi", "", AND(SM_75,PTX63))
+
 #undef BUILTIN
 #undef TARGET_BUILTIN
 #pragma pop_macro("AND")
 #pragma pop_macro("SM_60")
 #pragma pop_macro("SM_70")
+#pragma pop_macro("SM_72")
+#pragma pop_macro("SM_75")
 #pragma pop_macro("PTX60")
 #pragma pop_macro("PTX61")
+#pragma pop_macro("PTX63")
+#pragma pop_macro("PTX64")
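A side note on how the feature strings above compose: adjacent string literals concatenate, so each SM_xx/PTXyy macro folds in every newer target that also provides the feature. The expansions below restate the definitions from the hunk above (how AND combines the two resulting strings is left to the existing AND macro and not shown here).

#define SM_75 "sm_75"
#define SM_72 "sm_72|" SM_75        // expands to "sm_72|sm_75"
#define PTX64 "ptx64"
#define PTX63 "ptx63|" PTX64        // expands to "ptx63|ptx64"
// So a builtin gated on AND(SM_72,PTX63) is available on sm_72 or sm_75,
// provided PTX 6.3 or 6.4 is in use.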