From 255eaa6faa1dda6783e6f3ec182a5ac776fd41c5 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 9 Oct 2018 14:49:00 +0000 Subject: [OPENMP][NVPTX] Support memory coalescing for globalized variables. Added support for memory coalescing for better performance for globalized variables. From now on all the globalized variables are represented as arrays of 32 elements and each thread accesses these elements using `tid & 31` as index. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@344049 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp') diff --git a/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp index 2b18f6d3f9..2a7344f773 100644 --- a/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp +++ b/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp @@ -26,10 +26,13 @@ int main(int argc, char **argv) { // CHECK-LABEL: define internal void @__omp_offloading_{{.*}}_main_l17_worker( // CHECK: define weak void @__omp_offloading_{{.*}}_main_l17([10 x i32]* dereferenceable(40) %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}}, i32* dereferenceable(4) %{{.+}}, i{{64|32}} %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}}) -// CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 84, i16 0) +// CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 2688, i16 0) // CHECK: [[STACK:%.+]] = bitcast i8* [[PTR]] to %struct._globalized_locals_ty* // CHECK: [[ARGC:%.+]] = load i32, i32* %{{.+}}, align -// CHECK: [[ARGC_ADDR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 0 +// CHECK: [[ARGC_ARR_ADDR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 0 +// CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() +// CHECK: [[LID:%.+]] = and i32 [[TID]], 31 +// CHECK: [[ARGC_ADDR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[ARGC_ARR_ADDR]], i32 0, i32 [[LID]] // CHECK: store i32 [[ARGC]], i32* [[ARGC_ADDR]], // CHECK: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 1 // CHECK: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 2 -- cgit v1.2.3 From c0a2732d30f9ffb73b9b449f2c1d4c858f153dfe Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 11 Oct 2018 18:30:31 +0000 Subject: [OPENMP][NVPTX]Reduce memory use for globalized vars in target/teams/distribute regions. Previously introduced globalization scheme that uses memory coalescing scheme may increase memory usage fr the variables that are devlared in target/teams/distribute contexts. We don't need 32 copies of such variables, just 1. Patch reduces memory use in this case. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@344273 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp') diff --git a/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp index 2a7344f773..2b18f6d3f9 100644 --- a/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp +++ b/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp @@ -26,13 +26,10 @@ int main(int argc, char **argv) { // CHECK-LABEL: define internal void @__omp_offloading_{{.*}}_main_l17_worker( // CHECK: define weak void @__omp_offloading_{{.*}}_main_l17([10 x i32]* dereferenceable(40) %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}}, i32* dereferenceable(4) %{{.+}}, i{{64|32}} %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}}) -// CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 2688, i16 0) +// CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 84, i16 0) // CHECK: [[STACK:%.+]] = bitcast i8* [[PTR]] to %struct._globalized_locals_ty* // CHECK: [[ARGC:%.+]] = load i32, i32* %{{.+}}, align -// CHECK: [[ARGC_ARR_ADDR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 0 -// CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x() -// CHECK: [[LID:%.+]] = and i32 [[TID]], 31 -// CHECK: [[ARGC_ADDR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[ARGC_ARR_ADDR]], i32 0, i32 [[LID]] +// CHECK: [[ARGC_ADDR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 0 // CHECK: store i32 [[ARGC]], i32* [[ARGC_ADDR]], // CHECK: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 1 // CHECK: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 2 -- cgit v1.2.3 From 40a12dcd50f1ff99c81616b47662a85c60d8d54d Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 2 Nov 2018 14:54:07 +0000 Subject: [OPENMP][NVPTX]Improve emission of the globalized variables for target/teams/distribute regions. Target/teams/distribute regions exist for all the time the kernel is executed. Thus, if the variable is declared in their context and then escape it, we can allocate global memory statically instead of allocating it dynamically. Patch captures all the globalized variables in target/teams/distribute contexts, merges them into the records, one per each target region. Those records are then joined into the union, one per compilation unit (to save the global memory). Those units are organized into 2 x dimensional arrays, where the first dimension is the number of blocks per SM and the second one is the number of SMs. Runtime functions manage this global memory space between the executing teams. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@345978 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../nvptx_distribute_parallel_generic_mode_codegen.cpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp') diff --git a/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp index 2b18f6d3f9..c41964517b 100644 --- a/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp +++ b/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp @@ -21,12 +21,19 @@ int main(int argc, char **argv) { return 0; } -// CHECK: @__omp_offloading_{{.*}}_main_l17_exec_mode = weak constant i8 1 +// CHECK: [[MEM_TY:%.+]] = type { [84 x i8] } +// CHECK-DAG: [[GLOBAL_RD:@.+]] = weak global [{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]] zeroinitializer +// CHECK-DAG: [[GLOBAL_RD_PTR:@.+]] = weak unnamed_addr constant i8* getelementptr inbounds ([{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]], [{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]]* [[GLOBAL_RD]], i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0) +// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null +// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 84 +// CHECK-DAG: @__omp_offloading_{{.*}}_main_l17_exec_mode = weak constant i8 1 // CHECK-LABEL: define internal void @__omp_offloading_{{.*}}_main_l17_worker( // CHECK: define weak void @__omp_offloading_{{.*}}_main_l17([10 x i32]* dereferenceable(40) %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}}, i32* dereferenceable(4) %{{.+}}, i{{64|32}} %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}}) -// CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 84, i16 0) +// CHECK: [[GLOBAL_RD:%.+]] = load i8*, i8** [[GLOBAL_RD_PTR]], +// CHECK: call void @__kmpc_get_team_static_memory(i8* [[GLOBAL_RD]], i{{64|32}} 84, i16 0, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) +// CHECK: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], // CHECK: [[STACK:%.+]] = bitcast i8* [[PTR]] to %struct._globalized_locals_ty* // CHECK: [[ARGC:%.+]] = load i32, i32* %{{.+}}, align // CHECK: [[ARGC_ADDR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 0 @@ -41,7 +48,7 @@ int main(int argc, char **argv) { // CHECK: call void @__kmpc_for_static_fini(%struct.ident_t* @ -// CHECK: call void @__kmpc_data_sharing_pop_stack(i8* [[PTR]]) +// CHECK: call void @__kmpc_restore_team_static_memory(i16 0) // CHECK: define internal void [[PARALLEL]]( // CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack( -- cgit v1.2.3 From 223835dbb1f8b240a8262100dbef25b02e6558c7 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 9 Nov 2018 16:18:04 +0000 Subject: [OPENMP][NVPTX]Allow to use shared memory for the target|teams|distribute variables. If the total size of the variables, declared in target|teams|distribute regions, is less than the maximal size of shared memory available, the buffer is allocated in the shared memory. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@346507 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp') diff --git a/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp index c41964517b..71a3ad5491 100644 --- a/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp +++ b/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp @@ -22,8 +22,7 @@ int main(int argc, char **argv) { } // CHECK: [[MEM_TY:%.+]] = type { [84 x i8] } -// CHECK-DAG: [[GLOBAL_RD:@.+]] = weak global [{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]] zeroinitializer -// CHECK-DAG: [[GLOBAL_RD_PTR:@.+]] = weak unnamed_addr constant i8* getelementptr inbounds ([{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]], [{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]]* [[GLOBAL_RD]], i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0) +// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer // CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null // CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 84 // CHECK-DAG: @__omp_offloading_{{.*}}_main_l17_exec_mode = weak constant i8 1 @@ -31,8 +30,7 @@ int main(int argc, char **argv) { // CHECK-LABEL: define internal void @__omp_offloading_{{.*}}_main_l17_worker( // CHECK: define weak void @__omp_offloading_{{.*}}_main_l17([10 x i32]* dereferenceable(40) %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}}, i32* dereferenceable(4) %{{.+}}, i{{64|32}} %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}}) -// CHECK: [[GLOBAL_RD:%.+]] = load i8*, i8** [[GLOBAL_RD_PTR]], -// CHECK: call void @__kmpc_get_team_static_memory(i8* [[GLOBAL_RD]], i{{64|32}} 84, i16 0, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) +// CHECK: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} 84, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**)) // CHECK: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]], // CHECK: [[STACK:%.+]] = bitcast i8* [[PTR]] to %struct._globalized_locals_ty* // CHECK: [[ARGC:%.+]] = load i32, i32* %{{.+}}, align @@ -48,7 +46,7 @@ int main(int argc, char **argv) { // CHECK: call void @__kmpc_for_static_fini(%struct.ident_t* @ -// CHECK: call void @__kmpc_restore_team_static_memory(i16 0) +// CHECK: call void @__kmpc_restore_team_static_memory(i16 1) // CHECK: define internal void [[PARALLEL]]( // CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack( -- cgit v1.2.3 From f7b7bb1775b393a376b74dc23be4e265b2bfb317 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 16 Nov 2018 19:38:21 +0000 Subject: [OPENMP][NVPTX]Emit correct reduction code for teams/parallel reductions. Fixed previously committed code for the reduction support in teams/parallel constructs taking into account new design of the NVPTX support in the compiler. Teams reduction are not fully functional yet, it is going to be fixed in the following patches. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@347081 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp') diff --git a/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp index 71a3ad5491..a84962c4d8 100644 --- a/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp +++ b/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp @@ -22,7 +22,7 @@ int main(int argc, char **argv) { } // CHECK: [[MEM_TY:%.+]] = type { [84 x i8] } -// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer +// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer // CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null // CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 84 // CHECK-DAG: @__omp_offloading_{{.*}}_main_l17_exec_mode = weak constant i8 1 -- cgit v1.2.3