From 255eaa6faa1dda6783e6f3ec182a5ac776fd41c5 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@hotmail.com>
Date: Tue, 9 Oct 2018 14:49:00 +0000
Subject: [OPENMP][NVPTX] Support memory coalescing for globalized variables.

Added support for memory coalescing for better performance for
globalized variables. From now on all the globalized variables are
represented as arrays of 32 elements and each thread accesses these
elements using `tid & 31` as index.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@344049 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp')

diff --git a/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
index 2b18f6d3f9..2a7344f773 100644
--- a/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
+++ b/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
@@ -26,10 +26,13 @@ int main(int argc, char **argv) {
 // CHECK-LABEL: define internal void @__omp_offloading_{{.*}}_main_l17_worker(
 
 // CHECK: define weak void @__omp_offloading_{{.*}}_main_l17([10 x i32]* dereferenceable(40) %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}}, i32* dereferenceable(4) %{{.+}}, i{{64|32}} %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}})
-// CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 84, i16 0)
+// CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 2688, i16 0)
 // CHECK: [[STACK:%.+]] = bitcast i8* [[PTR]] to %struct._globalized_locals_ty*
 // CHECK: [[ARGC:%.+]] = load i32, i32* %{{.+}}, align
-// CHECK: [[ARGC_ADDR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[ARGC_ARR_ADDR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 0
+// CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+// CHECK: [[LID:%.+]] = and i32 [[TID]], 31
+// CHECK: [[ARGC_ADDR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[ARGC_ARR_ADDR]], i32 0, i32 [[LID]]
 // CHECK: store i32 [[ARGC]], i32* [[ARGC_ADDR]],
 // CHECK: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 1
 // CHECK: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 2
-- 
cgit v1.2.3


From c0a2732d30f9ffb73b9b449f2c1d4c858f153dfe Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@hotmail.com>
Date: Thu, 11 Oct 2018 18:30:31 +0000
Subject: [OPENMP][NVPTX]Reduce memory use for globalized vars in
 target/teams/distribute regions.

Previously introduced globalization scheme that uses memory coalescing
scheme may increase memory usage fr the variables that are devlared in
target/teams/distribute contexts. We don't need 32 copies of such
variables, just 1. Patch reduces memory use in this case.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@344273 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp')

diff --git a/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
index 2a7344f773..2b18f6d3f9 100644
--- a/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
+++ b/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
@@ -26,13 +26,10 @@ int main(int argc, char **argv) {
 // CHECK-LABEL: define internal void @__omp_offloading_{{.*}}_main_l17_worker(
 
 // CHECK: define weak void @__omp_offloading_{{.*}}_main_l17([10 x i32]* dereferenceable(40) %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}}, i32* dereferenceable(4) %{{.+}}, i{{64|32}} %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}})
-// CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 2688, i16 0)
+// CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 84, i16 0)
 // CHECK: [[STACK:%.+]] = bitcast i8* [[PTR]] to %struct._globalized_locals_ty*
 // CHECK: [[ARGC:%.+]] = load i32, i32* %{{.+}}, align
-// CHECK: [[ARGC_ARR_ADDR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 0
-// CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-// CHECK: [[LID:%.+]] = and i32 [[TID]], 31
-// CHECK: [[ARGC_ADDR:%.+]] = getelementptr inbounds [32 x i32], [32 x i32]* [[ARGC_ARR_ADDR]], i32 0, i32 [[LID]]
+// CHECK: [[ARGC_ADDR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 0
 // CHECK: store i32 [[ARGC]], i32* [[ARGC_ADDR]],
 // CHECK: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 1
 // CHECK: getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 2
-- 
cgit v1.2.3


From 40a12dcd50f1ff99c81616b47662a85c60d8d54d Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@hotmail.com>
Date: Fri, 2 Nov 2018 14:54:07 +0000
Subject: [OPENMP][NVPTX]Improve emission of the globalized variables for
 target/teams/distribute regions.

Target/teams/distribute regions exist for all the time the kernel is
executed. Thus, if the variable is declared in their context and then
escape it, we can allocate global memory statically instead of
allocating it dynamically.
Patch captures all the globalized variables in target/teams/distribute
contexts, merges them into the records, one per each target region.
Those records are then joined into the union, one per compilation unit
(to save the global memory). Those units are organized into
2 x dimensional arrays, where the first dimension is
the number of blocks per SM and the second one is the number of SMs.
Runtime functions manage this global memory space between the executing
teams.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@345978 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../nvptx_distribute_parallel_generic_mode_codegen.cpp      | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp')

diff --git a/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
index 2b18f6d3f9..c41964517b 100644
--- a/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
+++ b/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
@@ -21,12 +21,19 @@ int main(int argc, char **argv) {
   return 0;
 }
 
-// CHECK: @__omp_offloading_{{.*}}_main_l17_exec_mode = weak constant i8 1
+// CHECK: [[MEM_TY:%.+]] = type { [84 x i8] }
+// CHECK-DAG: [[GLOBAL_RD:@.+]] = weak global [{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]] zeroinitializer
+// CHECK-DAG: [[GLOBAL_RD_PTR:@.+]] = weak unnamed_addr constant i8* getelementptr inbounds ([{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]], [{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]]* [[GLOBAL_RD]], i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0)
+// CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
+// CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 84
+// CHECK-DAG: @__omp_offloading_{{.*}}_main_l17_exec_mode = weak constant i8 1
 
 // CHECK-LABEL: define internal void @__omp_offloading_{{.*}}_main_l17_worker(
 
 // CHECK: define weak void @__omp_offloading_{{.*}}_main_l17([10 x i32]* dereferenceable(40) %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}}, i32* dereferenceable(4) %{{.+}}, i{{64|32}} %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}})
-// CHECK: [[PTR:%.+]] = call i8* @__kmpc_data_sharing_push_stack(i{{64|32}} 84, i16 0)
+// CHECK: [[GLOBAL_RD:%.+]] = load i8*, i8** [[GLOBAL_RD_PTR]],
+// CHECK: call void @__kmpc_get_team_static_memory(i8* [[GLOBAL_RD]], i{{64|32}} 84, i16 0, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
+// CHECK: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
 // CHECK: [[STACK:%.+]] = bitcast i8* [[PTR]] to %struct._globalized_locals_ty*
 // CHECK: [[ARGC:%.+]] = load i32, i32* %{{.+}}, align
 // CHECK: [[ARGC_ADDR:%.+]] = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* [[STACK]], i{{32|64}} 0, i{{32|64}} 0
@@ -41,7 +48,7 @@ int main(int argc, char **argv) {
 
 // CHECK: call void @__kmpc_for_static_fini(%struct.ident_t* @
 
-// CHECK: call void @__kmpc_data_sharing_pop_stack(i8* [[PTR]])
+// CHECK: call void @__kmpc_restore_team_static_memory(i16 0)
 
 // CHECK: define internal void [[PARALLEL]](
 // CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack(
-- 
cgit v1.2.3


From 223835dbb1f8b240a8262100dbef25b02e6558c7 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@hotmail.com>
Date: Fri, 9 Nov 2018 16:18:04 +0000
Subject: [OPENMP][NVPTX]Allow to use shared memory for the
 target|teams|distribute variables.

If the total size of the variables, declared in target|teams|distribute
regions, is less than the maximal size of shared memory available, the
buffer is allocated in the shared memory.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@346507 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp')

diff --git a/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
index c41964517b..71a3ad5491 100644
--- a/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
+++ b/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
@@ -22,8 +22,7 @@ int main(int argc, char **argv) {
 }
 
 // CHECK: [[MEM_TY:%.+]] = type { [84 x i8] }
-// CHECK-DAG: [[GLOBAL_RD:@.+]] = weak global [{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]] zeroinitializer
-// CHECK-DAG: [[GLOBAL_RD_PTR:@.+]] = weak unnamed_addr constant i8* getelementptr inbounds ([{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]], [{{[0-9]+}} x [{{[0-9]+}} x [[MEM_TY]]]]* [[GLOBAL_RD]], i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0, i{{[0-9]+}} 0)
+// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
 // CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
 // CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 84
 // CHECK-DAG: @__omp_offloading_{{.*}}_main_l17_exec_mode = weak constant i8 1
@@ -31,8 +30,7 @@ int main(int argc, char **argv) {
 // CHECK-LABEL: define internal void @__omp_offloading_{{.*}}_main_l17_worker(
 
 // CHECK: define weak void @__omp_offloading_{{.*}}_main_l17([10 x i32]* dereferenceable(40) %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}}, i32* dereferenceable(4) %{{.+}}, i{{64|32}} %{{.+}}, [10 x i32]* dereferenceable(40) %{{.+}})
-// CHECK: [[GLOBAL_RD:%.+]] = load i8*, i8** [[GLOBAL_RD_PTR]],
-// CHECK: call void @__kmpc_get_team_static_memory(i8* [[GLOBAL_RD]], i{{64|32}} 84, i16 0, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
+// CHECK: call void @__kmpc_get_team_static_memory(i8* addrspacecast (i8 addrspace(3)* getelementptr inbounds ([[MEM_TY]], [[MEM_TY]] addrspace(3)* [[SHARED_GLOBAL_RD]], i32 0, i32 0, i32 0) to i8*), i{{64|32}} 84, i16 1, i8** addrspacecast (i8* addrspace(3)* [[KERNEL_PTR]] to i8**))
 // CHECK: [[PTR:%.+]] = load i8*, i8* addrspace(3)* [[KERNEL_PTR]],
 // CHECK: [[STACK:%.+]] = bitcast i8* [[PTR]] to %struct._globalized_locals_ty*
 // CHECK: [[ARGC:%.+]] = load i32, i32* %{{.+}}, align
@@ -48,7 +46,7 @@ int main(int argc, char **argv) {
 
 // CHECK: call void @__kmpc_for_static_fini(%struct.ident_t* @
 
-// CHECK: call void @__kmpc_restore_team_static_memory(i16 0)
+// CHECK: call void @__kmpc_restore_team_static_memory(i16 1)
 
 // CHECK: define internal void [[PARALLEL]](
 // CHECK-NOT: call i8* @__kmpc_data_sharing_push_stack(
-- 
cgit v1.2.3


From f7b7bb1775b393a376b74dc23be4e265b2bfb317 Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@hotmail.com>
Date: Fri, 16 Nov 2018 19:38:21 +0000
Subject: [OPENMP][NVPTX]Emit correct reduction code for teams/parallel
 reductions.

Fixed previously committed code for the reduction support in
teams/parallel constructs taking into account new design of the NVPTX
support in the compiler. Teams reduction are not fully functional yet,
it is going to be fixed in the following patches.

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@347081 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp')

diff --git a/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp b/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
index 71a3ad5491..a84962c4d8 100644
--- a/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
+++ b/test/OpenMP/nvptx_distribute_parallel_generic_mode_codegen.cpp
@@ -22,7 +22,7 @@ int main(int argc, char **argv) {
 }
 
 // CHECK: [[MEM_TY:%.+]] = type { [84 x i8] }
-// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = weak addrspace(3) global [[MEM_TY]] zeroinitializer
+// CHECK-DAG: [[SHARED_GLOBAL_RD:@.+]] = common addrspace(3) global [[MEM_TY]] zeroinitializer
 // CHECK-DAG: [[KERNEL_PTR:@.+]] = internal addrspace(3) global i8* null
 // CHECK-DAG: [[KERNEL_SIZE:@.+]] = internal unnamed_addr constant i{{64|32}} 84
 // CHECK-DAG: @__omp_offloading_{{.*}}_main_l17_exec_mode = weak constant i8 1
-- 
cgit v1.2.3