First major step toward addressing PR14059. This teaches SROA to handle

cases where we have partial integer loads and stores to an otherwise promotable alloca to widen[1] those loads and stores to cover the entire alloca and bitcast them into the appropriate type such that promotion can proceed. These partial loads and stores stem from an annoying confluence of ARM's calling convention and ABI lowering and the FCA pre-splitting which takes place in SROA. Clang lowers a { double, double } in-register function argument as a [4 x i32] function argument to ensure it is placed into integer 32-bit registers (a really unnerving implicit contract between Clang and the ARM backend I would add). This results in a FCA load of [4 x i32]* from the { double, double } alloca, and SROA decomposes this into a sequence of i32 loads and stores. Inlining proceeds, code gets folded, but at the end of the day, we still have i32 stores to the low and high halves of a double alloca. Widening these to be i64 operations, and bitcasting them to double prior to loading or storing allows promotion to proceed for these allocas. I looked quite a bit changing the IR which Clang produces for this case to be more friendly, but small changes seem unlikely to help. I think the best representation we could use currently would be to pass 4 i32 arguments thereby avoiding any FCAs, but that would still require this fix. It seems like it might eventually be nice to somehow encode the ABI register selection choices outside of the parameter type system so that the parameter can be a { double, double }, but the CC register annotations indicate that this should be passed via 4 integer registers. This patch does not address the second problem in PR14059, which is the reverse: when a struct alloca is loaded as a *larger* single integer. This patch also does not address some of the code quality issues with the FCA-splitting. Those don't actually impede any optimizations really, but they're on my list to clean up. [1]: Pedantic footnote: for those concerned about memory model issues here, this is safe. For the alloca to be promotable, it cannot escape or have any use of its address that could allow these loads or stores to be racing. Thus, widening is always safe. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@165928 91177308-0d34-0410-b5e6-96231b3b80d8
author: Chandler Carruth <chandlerc@gmail.com> 2012-10-15 08:40:30 +0000
committer: Chandler Carruth <chandlerc@gmail.com> 2012-10-15 08:40:30 +0000
commit: 81ff90db4403f9b4ee3574f00ebdfba4a12177c9 (patch)
tree: f4e8b9d46e61feb969d555c76cb114a3095befce /test/Transforms/SROA
parent: 11cb6ba5d0fe65c7249ef94c83df294b88d575f0 (diff)
download: llvm-81ff90db4403f9b4ee3574f00ebdfba4a12177c9.tar.gz
llvm-81ff90db4403f9b4ee3574f00ebdfba4a12177c9.tar.bz2
llvm-81ff90db4403f9b4ee3574f00ebdfba4a12177c9.tar.xz
2 files changed, 57 insertions, 23 deletions
diff --git a/test/Transforms/SROA/basictest.ll b/test/Transforms/SROA/basictest.ll
index 7d8e5cdf86..4a87d911e2 100644
--- a/test/Transforms/SROA/basictest.ll
+++ b/test/Transforms/SROA/basictest.ll
@@ -409,8 +409,11 @@ declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
 
 define i16 @test5() {
 ; CHECK: @test5
-; CHECK: alloca float
-; CHECK: ret i16 %
+; CHECK-NOT: alloca float
+; CHECK:      %[[cast:.*]] = bitcast float 0.0{{.*}} to i32
+; CHECK-NEXT: %[[shr:.*]] = lshr i32 %[[cast]], 16
+; CHECK-NEXT: %[[trunc:.*]] = trunc i32 %[[shr]] to i16
+; CHECK-NEXT: ret i16 %[[trunc]]
 
 entry:
   %a = alloca [4 x i8]
@@ -1012,3 +1015,34 @@ entry:
   ret i32 %valcast2
 ; CHECK: ret i32
 }
+
+define void @PR14059.1(double* %d) {
+; In PR14059 a peculiar construct was identified as something that is used
+; pervasively in ARM's ABI-calling-convention lowering: the passing of a struct
+; of doubles via an array of i32 in order to place the data into integer
+; registers. This in turn was missed as an optimization by SROA due to the
+; partial loads and stores of integers to the double alloca we were trying to
+; form and promote. The solution is to widen the integer operations to be
+; whole-alloca operations, and perform the appropriate bitcasting on the
+; *values* rather than the pointers. When this works, partial reads and writes
+; via integers can be promoted away.
+; CHECK: @PR14059.1
+; CHECK-NOT: alloca
+; CHECK: ret void
+
+entry:
+  %X.sroa.0.i = alloca double, align 8
+  %0 = bitcast double* %X.sroa.0.i to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %0)
+  %X.sroa.0.0.cast2.i = bitcast double* %X.sroa.0.i to i32*
+  store i32 0, i32* %X.sroa.0.0.cast2.i, align 8
+  %X.sroa.0.4.raw_idx4.i = getelementptr inbounds i8* %0, i32 4
+  %X.sroa.0.4.cast5.i = bitcast i8* %X.sroa.0.4.raw_idx4.i to i32*
+  store i32 1072693248, i32* %X.sroa.0.4.cast5.i, align 4
+  %X.sroa.0.0.load1.i = load double* %X.sroa.0.i, align 8
+  %accum.real.i = load double* %d, align 8
+  %add.r.i = fadd double %accum.real.i, %X.sroa.0.0.load1.i
+  store double %add.r.i, double* %d, align 8
+  call void @llvm.lifetime.end(i64 -1, i8* %0)
+  ret void
+}
diff --git a/test/Transforms/SROA/phi-and-select.ll b/test/Transforms/SROA/phi-and-select.ll
index 2b0724c7fd..d95e48f303 100644
--- a/test/Transforms/SROA/phi-and-select.ll
+++ b/test/Transforms/SROA/phi-and-select.ll
@@ -256,17 +256,17 @@ entry:
   ret i32 %loaded
 }
 
-define i32 @test10(i32 %b, i32* %ptr) {
+define float @test10(i32 %b, float* %ptr) {
 ; Don't try to promote allocas which are not elligible for it even after
 ; rewriting due to the necessity of inserting bitcasts when speculating a PHI
 ; node.
 ; CHECK: @test10
 ; CHECK: %[[alloca:.*]] = alloca
-; CHECK: %[[argvalue:.*]] = load i32* %ptr
-; CHECK: %[[cast:.*]] = bitcast double* %[[alloca]] to i32*
-; CHECK: %[[allocavalue:.*]] = load i32* %[[cast]]
-; CHECK: %[[result:.*]] = phi i32 [ %[[allocavalue]], %else ], [ %[[argvalue]], %then ]
-; CHECK-NEXT: ret i32 %[[result]]
+; CHECK: %[[argvalue:.*]] = load float* %ptr
+; CHECK: %[[cast:.*]] = bitcast double* %[[alloca]] to float*
+; CHECK: %[[allocavalue:.*]] = load float* %[[cast]]
+; CHECK: %[[result:.*]] = phi float [ %[[allocavalue]], %else ], [ %[[argvalue]], %then ]
+; CHECK-NEXT: ret float %[[result]]
 
 entry:
   %f = alloca double
@@ -278,34 +278,34 @@ then:
   br label %exit
 
 else:
-  %bitcast = bitcast double* %f to i32*
+  %bitcast = bitcast double* %f to float*
   br label %exit
 
 exit:
-  %phi = phi i32* [ %bitcast, %else ], [ %ptr, %then ]
-  %loaded = load i32* %phi, align 4
-  ret i32 %loaded
+  %phi = phi float* [ %bitcast, %else ], [ %ptr, %then ]
+  %loaded = load float* %phi, align 4
+  ret float %loaded
 }
 
-define i32 @test11(i32 %b, i32* %ptr) {
+define float @test11(i32 %b, float* %ptr) {
 ; Same as @test10 but for a select rather than a PHI node.
 ; CHECK: @test11
 ; CHECK: %[[alloca:.*]] = alloca
-; CHECK: %[[cast:.*]] = bitcast double* %[[alloca]] to i32*
-; CHECK: %[[allocavalue:.*]] = load i32* %[[cast]]
-; CHECK: %[[argvalue:.*]] = load i32* %ptr
-; CHECK: %[[result:.*]] = select i1 %{{.*}}, i32 %[[allocavalue]], i32 %[[argvalue]]
-; CHECK-NEXT: ret i32 %[[result]]
+; CHECK: %[[cast:.*]] = bitcast double* %[[alloca]] to float*
+; CHECK: %[[allocavalue:.*]] = load float* %[[cast]]
+; CHECK: %[[argvalue:.*]] = load float* %ptr
+; CHECK: %[[result:.*]] = select i1 %{{.*}}, float %[[allocavalue]], float %[[argvalue]]
+; CHECK-NEXT: ret float %[[result]]
 
 entry:
   %f = alloca double
   store double 0.0, double* %f
-  store i32 0, i32* %ptr
+  store float 0.0, float* %ptr
   %test = icmp ne i32 %b, 0
-  %bitcast = bitcast double* %f to i32*
-  %select = select i1 %test, i32* %bitcast, i32* %ptr
-  %loaded = load i32* %select, align 4
-  ret i32 %loaded
+  %bitcast = bitcast double* %f to float*
+  %select = select i1 %test, float* %bitcast, float* %ptr
+  %loaded = load float* %select, align 4
+  ret float %loaded
 }
 
 define i32 @test12(i32 %x, i32* %p) {
author	Chandler Carruth <chandlerc@gmail.com>	2012-10-15 08:40:30 +0000
committer	Chandler Carruth <chandlerc@gmail.com>	2012-10-15 08:40:30 +0000
commit	81ff90db4403f9b4ee3574f00ebdfba4a12177c9 (patch)
tree	f4e8b9d46e61feb969d555c76cb114a3095befce /test/Transforms/SROA
parent	11cb6ba5d0fe65c7249ef94c83df294b88d575f0 (diff)
download	llvm-81ff90db4403f9b4ee3574f00ebdfba4a12177c9.tar.gz llvm-81ff90db4403f9b4ee3574f00ebdfba4a12177c9.tar.bz2 llvm-81ff90db4403f9b4ee3574f00ebdfba4a12177c9.tar.xz