Teach SROA how to split whole-alloca integer loads and stores into

smaller integer loads and stores. The high-level motivation is that the frontend sometimes generates a single whole-alloca integer load or store during ABI lowering of splittable allocas. We need to be able to break this apart in order to see the underlying elements and properly promote them to SSA values. The hope is that this fixes some performance regressions on x86-32 with the new SROA pass. Unfortunately, this causes quite a bit of churn in the test cases, and bloats some IR that comes out. When we see an alloca that consists solely of bits and bytes being extracted and re-inserted, we now do some splitting first, before building widened integer "bucket of bits" representations. These are always well folded by instcombine, however, so this shouldn't actually result in missed opportunities. If this splitting of all-integer allocas does cause problems (perhaps due to smaller SSA values going into the RA), we could potentially go to some extreme measures to only do this integer splitting trick when there are non-integer component accesses of an alloca, but discovering this is quite expensive: it adds yet another complete walk of the recursive use tree of the alloca. Either way, I will be watching build bots and LNT bots to see what fallout there is here. If anyone gets x86-32 numbers before & after this change, I would be very interested. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166662 91177308-0d34-0410-b5e6-96231b3b80d8
author: Chandler Carruth <chandlerc@gmail.com> 2012-10-25 04:37:07 +0000
committer: Chandler Carruth <chandlerc@gmail.com> 2012-10-25 04:37:07 +0000
commit: a2b88163af30f59d12ae0172565f3406bdbf6c45 (patch)
tree: 2bca1e0574a0720c4ceba0c74b31231ddf27efd6 /test
parent: 8dbac7b529cfb73bcd0ceef514e5c1d247cf3baa (diff)
download: llvm-a2b88163af30f59d12ae0172565f3406bdbf6c45.tar.gz
llvm-a2b88163af30f59d12ae0172565f3406bdbf6c45.tar.bz2
llvm-a2b88163af30f59d12ae0172565f3406bdbf6c45.tar.xz
2 files changed, 85 insertions, 23 deletions
diff --git a/test/Transforms/SROA/basictest.ll b/test/Transforms/SROA/basictest.ll
index 03120f7a32..110950f76a 100644
--- a/test/Transforms/SROA/basictest.ll
+++ b/test/Transforms/SROA/basictest.ll
@@ -577,9 +577,17 @@ entry:
   %ai = load i24* %aiptr
 ; CHCEK-NOT: store
 ; CHCEK-NOT: load
-; CHECK:      %[[mask0:.*]] = and i24 undef, -256
-; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[mask0]], -65281
-; CHECK-NEXT: %[[mask2:.*]] = and i24 %[[mask1]], 65535
+; CHECK:      %[[ext2:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift2:.*]] = shl i24 %[[ext2]], 16
+; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, 65535
+; CHECK-NEXT: %[[insert2:.*]] = or i24 %[[mask2]], %[[shift2]]
+; CHECK-NEXT: %[[ext1:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift1:.*]] = shl i24 %[[ext1]], 8
+; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[insert2]], -65281
+; CHECK-NEXT: %[[insert1:.*]] = or i24 %[[mask1]], %[[shift1]]
+; CHECK-NEXT: %[[ext0:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[mask0:.*]] = and i24 %[[insert1]], -256
+; CHECK-NEXT: %[[insert0:.*]] = or i24 %[[mask0]], %[[ext0]]
 
   %biptr = bitcast [3 x i8]* %b to i24*
   store i24 %ai, i24* %biptr
@@ -591,10 +599,10 @@ entry:
   %b2 = load i8* %b2ptr
 ; CHCEK-NOT: store
 ; CHCEK-NOT: load
-; CHECK:      %[[trunc0:.*]] = trunc i24 %[[mask2]] to i8
-; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[mask2]], 8
+; CHECK:      %[[trunc0:.*]] = trunc i24 %[[insert0]] to i8
+; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8
 ; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8
-; CHECK-NEXT: %[[shift2:.*]] = lshr i24 %[[mask2]], 16
+; CHECK-NEXT: %[[shift2:.*]] = lshr i24 %[[insert0]], 16
 ; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[shift2]] to i8
 
   %bsum0 = add i8 %b0, %b1
@@ -1064,6 +1072,49 @@ entry:
   ret void
 }
 
+define i64 @PR14059.2({ float, float }* %phi) {
+; Check that SROA can split up alloca-wide integer loads and stores where the
+; underlying alloca has smaller components that are accessed independently. This
+; shows up particularly with ABI lowering patterns coming out of Clang that rely
+; on the particular register placement of a single large integer return value.
+; CHECK: @PR14059.2
+
+entry:
+  %retval = alloca { float, float }, align 4
+  ; CHECK-NOT: alloca
+
+  %0 = bitcast { float, float }* %retval to i64*
+  store i64 0, i64* %0
+  ; CHECK-NOT: store
+
+  %phi.realp = getelementptr inbounds { float, float }* %phi, i32 0, i32 0
+  %phi.real = load float* %phi.realp
+  %phi.imagp = getelementptr inbounds { float, float }* %phi, i32 0, i32 1
+  %phi.imag = load float* %phi.imagp
+  ; CHECK:      %[[realp:.*]] = getelementptr inbounds { float, float }* %phi, i32 0, i32 0
+  ; CHECK-NEXT: %[[real:.*]] = load float* %[[realp]]
+  ; CHECK-NEXT: %[[imagp:.*]] = getelementptr inbounds { float, float }* %phi, i32 0, i32 1
+  ; CHECK-NEXT: %[[imag:.*]] = load float* %[[imagp]]
+
+  %real = getelementptr inbounds { float, float }* %retval, i32 0, i32 0
+  %imag = getelementptr inbounds { float, float }* %retval, i32 0, i32 1
+  store float %phi.real, float* %real
+  store float %phi.imag, float* %imag
+  ; CHECK-NEXT: %[[imag_convert:.*]] = bitcast float %[[imag]] to i32
+  ; CHECK-NEXT: %[[imag_ext:.*]] = zext i32 %[[imag_convert]] to i64
+  ; CHECK-NEXT: %[[imag_shift:.*]] = shl i64 %[[imag_ext]], 32
+  ; CHECK-NEXT: %[[imag_mask:.*]] = and i64 undef, 4294967295
+  ; CHECK-NEXT: %[[imag_insert:.*]] = or i64 %[[imag_mask]], %[[imag_shift]]
+  ; CHECK-NEXT: %[[real_convert:.*]] = bitcast float %[[real]] to i32
+  ; CHECK-NEXT: %[[real_ext:.*]] = zext i32 %[[real_convert]] to i64
+  ; CHECK-NEXT: %[[real_mask:.*]] = and i64 %[[imag_insert]], -4294967296
+  ; CHECK-NEXT: %[[real_insert:.*]] = or i64 %[[real_mask]], %[[real_ext]]
+
+  %1 = load i64* %0, align 1
+  ret i64 %1
+  ; CHECK-NEXT: ret i64 %[[real_insert]]
+}
+
 define void @PR14105({ [16 x i8] }* %ptr) {
 ; Ensure that when rewriting the GEP index '-1' for this alloca we preserve is
 ; sign as negative. We use a volatile memcpy to ensure promotion never actually
diff --git a/test/Transforms/SROA/big-endian.ll b/test/Transforms/SROA/big-endian.ll
index 532f8690cf..ce82d1f30b 100644
--- a/test/Transforms/SROA/big-endian.ll
+++ b/test/Transforms/SROA/big-endian.ll
@@ -26,9 +26,17 @@ entry:
   %ai = load i24* %aiptr
 ; CHCEK-NOT: store
 ; CHCEK-NOT: load
-; CHECK:      %[[mask0:.*]] = and i24 undef, 65535
-; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[mask0]], -65281
-; CHECK-NEXT: %[[mask2:.*]] = and i24 %[[mask1]], -256
+; CHECK:      %[[ext2:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, -256
+; CHECK-NEXT: %[[insert2:.*]] = or i24 %[[mask2]], %[[ext2]]
+; CHECK-NEXT: %[[ext1:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift1:.*]] = shl i24 %[[ext1]], 8
+; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[insert2]], -65281
+; CHECK-NEXT: %[[insert1:.*]] = or i24 %[[mask1]], %[[shift1]]
+; CHECK-NEXT: %[[ext0:.*]] = zext i8 0 to i24
+; CHECK-NEXT: %[[shift0:.*]] = shl i24 %[[ext0]], 16
+; CHECK-NEXT: %[[mask0:.*]] = and i24 %[[insert1]], 65535
+; CHECK-NEXT: %[[insert0:.*]] = or i24 %[[mask0]], %[[shift0]]
 
   %biptr = bitcast [3 x i8]* %b to i24*
   store i24 %ai, i24* %biptr
@@ -40,11 +48,11 @@ entry:
   %b2 = load i8* %b2ptr
 ; CHCEK-NOT: store
 ; CHCEK-NOT: load
-; CHECK:      %[[shift0:.*]] = lshr i24 %[[mask2]], 16
+; CHECK:      %[[shift0:.*]] = lshr i24 %[[insert0]], 16
 ; CHECK-NEXT: %[[trunc0:.*]] = trunc i24 %[[shift0]] to i8
-; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[mask2]], 8
+; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8
 ; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8
-; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[mask2]] to i8
+; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[insert0]] to i8
 
   %bsum0 = add i8 %b0, %b1
   %bsum1 = add i8 %bsum0, %b2
@@ -74,27 +82,26 @@ entry:
 
   %a0i16ptr = bitcast i8* %a0ptr to i16*
   store i16 1, i16* %a0i16ptr
-; CHECK:      %[[mask:.*]] = and i56 undef, 1099511627775
-; CHECK-NEXT: %[[or:.*]] = or i56 %[[mask]], 1099511627776
+; CHECK:      %[[mask0:.*]] = and i16 1, -16
 
   %a1i4ptr = bitcast i8* %a1ptr to i4*
   store i4 1, i4* %a1i4ptr
-; CHECK:      %[[mask:.*]] = and i56 %[[or]], -16492674416641
-; CHECK-NEXT: %[[or:.*]] = or i56 %[[mask]], 1099511627776
+; CHECK-NEXT: %[[insert0:.*]] = or i16 %[[mask0]], 1
 
   store i8 1, i8* %a2ptr
-; CHECK-NEXT: %[[mask:.*]] = and i56 %[[or]], -1095216660481
-; CHECK-NEXT: %[[or:.*]] = or i56 %[[mask]], 4294967296
+; CHECK-NEXT: %[[mask1:.*]] = and i40 undef, 4294967295
+; CHECK-NEXT: %[[insert1:.*]] = or i40 %[[mask1]], 4294967296
 
   %a3i24ptr = bitcast i8* %a3ptr to i24*
   store i24 1, i24* %a3i24ptr
-; CHECK-NEXT: %[[mask:.*]] = and i56 %[[or]], -4294967041
-; CHECK-NEXT: %[[or:.*]] = or i56 %[[mask]], 256
+; CHECK-NEXT: %[[mask2:.*]] = and i40 %[[insert1]], -4294967041
+; CHECK-NEXT: %[[insert2:.*]] = or i40 %[[mask2]], 256
 
   %a2i40ptr = bitcast i8* %a2ptr to i40*
   store i40 1, i40* %a2i40ptr
-; CHECK-NEXT: %[[mask:.*]] = and i56 %[[or]], -1099511627776
-; CHECK-NEXT: %[[or:.*]] = or i56 %[[mask]], 1
+; CHECK-NEXT: %[[ext3:.*]] = zext i40 1 to i56
+; CHECK-NEXT: %[[mask3:.*]] = and i56 undef, -1099511627776
+; CHECK-NEXT: %[[insert3:.*]] = or i56 %[[mask3]], %[[ext3]]
 
 ; CHCEK-NOT: store
 ; CHCEK-NOT: load
@@ -103,6 +110,10 @@ entry:
   %ai = load i56* %aiptr
   %ret = zext i56 %ai to i64
   ret i64 %ret
-; CHECK:      %[[ret:.*]] = zext i56 %[[or]] to i64
+; CHECK-NEXT: %[[ext4:.*]] = zext i16 %[[insert0]] to i56
+; CHECK-NEXT: %[[shift4:.*]] = shl i56 %[[ext4]], 40
+; CHECK-NEXT: %[[mask4:.*]] = and i56 %[[insert3]], 1099511627775
+; CHECK-NEXT: %[[insert4:.*]] = or i56 %[[mask4]], %[[shift4]]
+; CHECK-NEXT: %[[ret:.*]] = zext i56 %[[insert4]] to i64
 ; CHECK-NEXT: ret i64 %[[ret]]
 }
author	Chandler Carruth <chandlerc@gmail.com>	2012-10-25 04:37:07 +0000
committer	Chandler Carruth <chandlerc@gmail.com>	2012-10-25 04:37:07 +0000
commit	a2b88163af30f59d12ae0172565f3406bdbf6c45 (patch)
tree	2bca1e0574a0720c4ceba0c74b31231ddf27efd6 /test
parent	8dbac7b529cfb73bcd0ceef514e5c1d247cf3baa (diff)
download	llvm-a2b88163af30f59d12ae0172565f3406bdbf6c45.tar.gz llvm-a2b88163af30f59d12ae0172565f3406bdbf6c45.tar.bz2 llvm-a2b88163af30f59d12ae0172565f3406bdbf6c45.tar.xz