author     Ulrich Weigand <ulrich.weigand@de.ibm.com>   2014-07-21 00:56:36 +0000
committer  Ulrich Weigand <ulrich.weigand@de.ibm.com>   2014-07-21 00:56:36 +0000
commit     8dae99b9e139c485973af77abc5a6b23d81b9248 (patch)
tree       905e94b94a2db7e381fc203ea2f9dfcbd3c222b1
parent     3e86dfcf1398b8c7f9f8a595c500f4b12a574d26 (diff)
[PowerPC] Optimize passing certain aggregates by value
In addition to enabling ELFv2 homogeneous aggregate handling, LLVM support to
pass array types directly also enables a performance enhancement. We can now
pass (non-homogeneous) aggregates that fit fully in registers as direct
integer arrays, using an element type to encode the alignment requirement
(that would otherwise go to the "byval align" field).

This is preferable since "byval" forces the back-end to write the aggregate
out to the stack, even if it could be passed fully in registers. This is
particularly annoying on ELFv2, if there is no parameter save area available,
since we then need to allocate space on the callee's stack just to hold those
aggregates.

Note that to implement this optimization, this patch does not attempt to fully
anticipate register allocation rules as (defined in the ABI and) implemented
in the back-end. Instead, the patch simply passes *any* aggregate passed by
value using the array mechanism if its size is up to 64 bytes. This means that
some of those will end up being passed in stack slots anyway, but the
generated code shouldn't be any worse either. (*Large* aggregates remain
passed using "byval" to enable optimized copying via memcpy etc.)

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@213495 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--   lib/CodeGen/TargetInfo.cpp            25
-rw-r--r--   test/CodeGen/ppc64-align-struct.c     32
-rw-r--r--   test/CodeGen/ppc64-vector.c            2
-rw-r--r--   test/CodeGen/ppc64le-aggregates.c     11
4 files changed, 57 insertions, 13 deletions
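As a rough caller-side sketch of the effect (not part of the patch; the struct and
function names here are made up, and the lowerings shown in the comments mirror the
updated CHECK lines in test/CodeGen/ppc64-align-struct.c below): aggregates of up to
64 bytes are now coerced to integer arrays and can stay in registers, while larger
aggregates keep the byval path.

    /* Hypothetical example for a powerpc64 Linux target, as in the tests below. */
    struct small { int x; int y; int z; };   /* 12 bytes: fits in registers        */
    struct large { int x[17]; };             /* 68 bytes: above the 64-byte limit  */

    void take_small(struct small s);  /* expected: void @take_small([2 x i64] %s.coerce)               */
    void take_large(struct large l);  /* expected: void @take_large(%struct.large* byval align 8 %l)   */

    void caller(struct small s, struct large l)
    {
      take_small(s);   /* argument may be passed entirely in GPRs             */
      take_large(l);   /* still written out to memory via the byval mechanism */
    }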
diff --git a/lib/CodeGen/TargetInfo.cpp b/lib/CodeGen/TargetInfo.cpp
index 2ed33b00bc..5da22c3e6c 100644
--- a/lib/CodeGen/TargetInfo.cpp
+++ b/lib/CodeGen/TargetInfo.cpp
@@ -3178,6 +3178,31 @@ PPC64_SVR4_ABIInfo::classifyArgumentType(QualType Ty) const {
       return ABIArgInfo::getDirect(CoerceTy);
     }
 
+    // If an aggregate may end up fully in registers, we do not
+    // use the ByVal method, but pass the aggregate as array.
+    // This is usually beneficial since we avoid forcing the
+    // back-end to store the argument to memory.
+    uint64_t Bits = getContext().getTypeSize(Ty);
+    if (Bits > 0 && Bits <= 8 * GPRBits) {
+      llvm::Type *CoerceTy;
+
+      // Types up to 8 bytes are passed as integer type (which will be
+      // properly aligned in the argument save area doubleword).
+      if (Bits <= GPRBits)
+        CoerceTy = llvm::IntegerType::get(getVMContext(),
+                                          llvm::RoundUpToAlignment(Bits, 8));
+      // Larger types are passed as arrays, with the base type selected
+      // according to the required alignment in the save area.
+      else {
+        uint64_t RegBits = ABIAlign * 8;
+        uint64_t NumRegs = llvm::RoundUpToAlignment(Bits, RegBits) / RegBits;
+        llvm::Type *RegTy = llvm::IntegerType::get(getVMContext(), RegBits);
+        CoerceTy = llvm::ArrayType::get(RegTy, NumRegs);
+      }
+
+      return ABIArgInfo::getDirect(CoerceTy);
+    }
+
     // All other aggregates are passed ByVal.
     return ABIArgInfo::getIndirect(ABIAlign, /*ByVal=*/true,
                                    /*Realign=*/TyAlign > ABIAlign);
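A rough worked example of the new classification path (an illustration only, not part
of the patch; it assumes GPRBits == 64 and that the ABI alignment used here is capped
at 16 bytes, as the old "byval align 16" CHECK lines suggest), using the over-aligned
struct test3 from the test file below:

    struct test3 { int x; int y; } __attribute__((aligned (32)));
    /* Bits     = getTypeSize(test3) = 256 (8 data bytes padded to the 32-byte alignment) */
    /* Bits > GPRBits, so the array branch is taken:                                      */
    /*   RegBits  = ABIAlign * 8 = 128                                                    */
    /*   NumRegs  = RoundUpToAlignment(256, 128) / 128 = 2                                */
    /*   CoerceTy = [2 x i128], matching the new CHECK line for test3 below               */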
diff --git a/test/CodeGen/ppc64-align-struct.c b/test/CodeGen/ppc64-align-struct.c
index f820d2fbba..a50c849b45 100644
--- a/test/CodeGen/ppc64-align-struct.c
+++ b/test/CodeGen/ppc64-align-struct.c
@@ -6,30 +6,48 @@ struct test1 { int x; int y; };
struct test2 { int x; int y; } __attribute__((aligned (16)));
struct test3 { int x; int y; } __attribute__((aligned (32)));
struct test4 { int x; int y; int z; };
+struct test5 { int x[17]; };
+struct test6 { int x[17]; } __attribute__((aligned (16)));
+struct test7 { int x[17]; } __attribute__((aligned (32)));
-// CHECK: define void @test1(i32 signext %x, %struct.test1* byval align 8 %y)
+// CHECK: define void @test1(i32 signext %x, i64 %y.coerce)
void test1 (int x, struct test1 y)
{
}
-// CHECK: define void @test2(i32 signext %x, %struct.test2* byval align 16 %y)
+// CHECK: define void @test2(i32 signext %x, [1 x i128] %y.coerce)
void test2 (int x, struct test2 y)
{
}
-// This case requires run-time realignment of the incoming struct
-// CHECK: define void @test3(i32 signext %x, %struct.test3* byval align 16)
-// CHECK: %y = alloca %struct.test3, align 32
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64
+// CHECK: define void @test3(i32 signext %x, [2 x i128] %y.coerce)
void test3 (int x, struct test3 y)
{
}
-// CHECK: define void @test4(i32 signext %x, %struct.test4* byval align 8 %y)
+// CHECK: define void @test4(i32 signext %x, [2 x i64] %y.coerce)
void test4 (int x, struct test4 y)
{
}
+// CHECK: define void @test5(i32 signext %x, %struct.test5* byval align 8 %y)
+void test5 (int x, struct test5 y)
+{
+}
+
+// CHECK: define void @test6(i32 signext %x, %struct.test6* byval align 16 %y)
+void test6 (int x, struct test6 y)
+{
+}
+
+// This case requires run-time realignment of the incoming struct
+// CHECK: define void @test7(i32 signext %x, %struct.test7* byval align 16)
+// CHECK: %y = alloca %struct.test7, align 32
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64
+void test7 (int x, struct test7 y)
+{
+}
+
// CHECK: define void @test1va(%struct.test1* noalias sret %agg.result, i32 signext %x, ...)
// CHECK: %[[CUR:[^ ]+]] = load i8** %ap
// CHECK: %[[NEXT:[^ ]+]] = getelementptr i8* %[[CUR]], i64 8
diff --git a/test/CodeGen/ppc64-vector.c b/test/CodeGen/ppc64-vector.c
index 3ff07a4d41..f0211f0ec1 100644
--- a/test/CodeGen/ppc64-vector.c
+++ b/test/CodeGen/ppc64-vector.c
@@ -45,7 +45,7 @@ v16i16 test_v16i16(v16i16 x)
return x;
}
-// CHECK: define void @test_struct_v16i16(%struct.v16i16* noalias sret %agg.result, %struct.v16i16* byval align 16)
+// CHECK: define void @test_struct_v16i16(%struct.v16i16* noalias sret %agg.result, [2 x i128] %x.coerce)
struct v16i16 test_struct_v16i16(struct v16i16 x)
{
return x;
diff --git a/test/CodeGen/ppc64le-aggregates.c b/test/CodeGen/ppc64le-aggregates.c
index cb19dd31f2..acf34a8a80 100644
--- a/test/CodeGen/ppc64le-aggregates.c
+++ b/test/CodeGen/ppc64le-aggregates.c
@@ -40,7 +40,7 @@ struct f7 func_f7(struct f7 x) { return x; }
// CHECK: define [8 x float] @func_f8([8 x float] %x.coerce)
struct f8 func_f8(struct f8 x) { return x; }
-// CHECK: define void @func_f9(%struct.f9* noalias sret %agg.result, %struct.f9* byval align 8 %x)
+// CHECK: define void @func_f9(%struct.f9* noalias sret %agg.result, [5 x i64] %x.coerce)
struct f9 func_f9(struct f9 x) { return x; }
// CHECK: define [2 x float] @func_fab([2 x float] %x.coerce)
@@ -98,10 +98,11 @@ struct f8 global_f8;
void call_f8(void) { global_f8 = func_f8(global_f8); }
// CHECK-LABEL: @call_f9
-// CHECK: %[[TMP1:[^ ]+]] = alloca %struct.f9, align 8
-// CHECK: %[[TMP2:[^ ]+]] = bitcast %struct.f9* %[[TMP1]] to i8*
-// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[TMP2]], i8* bitcast (%struct.f9* @global_f9 to i8*), i64 36, i32 4, i1 false)
-// CHECK: call void @func_f9(%struct.f9* sret %{{[^ ]+}}, %struct.f9* byval align 8 %[[TMP1]])
+// CHECK: %[[TMP1:[^ ]+]] = alloca [5 x i64]
+// CHECK: %[[TMP2:[^ ]+]] = bitcast [5 x i64]* %[[TMP1]] to i8*
+// CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %[[TMP2]], i8* bitcast (%struct.f9* @global_f9 to i8*), i64 36, i32 1, i1 false)
+// CHECK: %[[TMP3:[^ ]+]] = load [5 x i64]* %[[TMP1]]
+// CHECK: call void @func_f9(%struct.f9* sret %{{[^ ]+}}, [5 x i64] %[[TMP3]])
struct f9 global_f9;
void call_f9(void) { global_f9 = func_f9(global_f9); }