summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHal Finkel <hfinkel@anl.gov>2013-09-15 22:09:58 +0000
committerHal Finkel <hfinkel@anl.gov>2013-09-15 22:09:58 +0000
commitfabfb5d5880354983c89c6f475312dd359e5bb03 (patch)
tree5df205fea4949ba0c64e38416d9aee31b440cf00
parent94ee55d4b39d6506cf4e0f4e4b1c0b7fbbfeaed5 (diff)
downloadllvm-fabfb5d5880354983c89c6f475312dd359e5bb03.tar.gz
llvm-fabfb5d5880354983c89c6f475312dd359e5bb03.tar.bz2
llvm-fabfb5d5880354983c89c6f475312dd359e5bb03.tar.xz
PPC: Don't restrict lvsl generation to after type legalization
This is a re-commit of r190764, with an extra check to make sure that we're not performing the transformation on illegal types (a small test case has been added for this as well). Original commit message: The PPC backend uses a target-specific DAG combine to turn unaligned Altivec loads into a permutation-based sequence when possible. Unfortunately, the target-specific DAG combine is not always called on all loads of interest (sometimes the routines in DAGCombine call CombineTo such that the new node and users are not added to the worklist); allowing the combine to trigger early (before type legalization) mitigates this problem. Because the autovectorizers only create legal vector types, I don't expect a lot of cases where this optimization is enabled by type legalization in practice. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@190771 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.cpp3
-rw-r--r--test/CodeGen/PowerPC/unal-altivec2.ll166
2 files changed, 168 insertions, 1 deletions
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index ae046f0b7a..83bd1abdde 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -7177,7 +7177,8 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
unsigned ABIAlignment = getDataLayout()->getABITypeAlignment(Ty);
if (ISD::isNON_EXTLoad(N) && VT.isVector() &&
TM.getSubtarget<PPCSubtarget>().hasAltivec() &&
- DCI.getDAGCombineLevel() == AfterLegalizeTypes &&
+ (VT == MVT::v16i8 || VT == MVT::v8i16 ||
+ VT == MVT::v4i32 || VT == MVT::v4f32) &&
LD->getAlignment() < ABIAlignment) {
// This is a type-legal unaligned Altivec load.
SDValue Chain = LD->getChain();
diff --git a/test/CodeGen/PowerPC/unal-altivec2.ll b/test/CodeGen/PowerPC/unal-altivec2.ll
new file mode 100644
index 0000000000..7464675470
--- /dev/null
+++ b/test/CodeGen/PowerPC/unal-altivec2.ll
@@ -0,0 +1,166 @@
+; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: nounwind
+define void @foo(float* noalias nocapture %x, float* noalias nocapture readonly %y) #0 {
+entry:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+; CHECK-LABEL: @foo
+; CHECK: lvsl
+; CHECK: blr
+ %index = phi i64 [ 0, %entry ], [ %index.next.15, %vector.body ]
+ %0 = getelementptr inbounds float* %y, i64 %index
+ %1 = bitcast float* %0 to <4 x float>*
+ %wide.load = load <4 x float>* %1, align 4
+ %2 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load)
+ %3 = getelementptr inbounds float* %x, i64 %index
+ %4 = bitcast float* %3 to <4 x float>*
+ store <4 x float> %2, <4 x float>* %4, align 4
+ %index.next = add i64 %index, 4
+ %5 = getelementptr inbounds float* %y, i64 %index.next
+ %6 = bitcast float* %5 to <4 x float>*
+ %wide.load.1 = load <4 x float>* %6, align 4
+ %7 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.1)
+ %8 = getelementptr inbounds float* %x, i64 %index.next
+ %9 = bitcast float* %8 to <4 x float>*
+ store <4 x float> %7, <4 x float>* %9, align 4
+ %index.next.1 = add i64 %index.next, 4
+ %10 = getelementptr inbounds float* %y, i64 %index.next.1
+ %11 = bitcast float* %10 to <4 x float>*
+ %wide.load.2 = load <4 x float>* %11, align 4
+ %12 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.2)
+ %13 = getelementptr inbounds float* %x, i64 %index.next.1
+ %14 = bitcast float* %13 to <4 x float>*
+ store <4 x float> %12, <4 x float>* %14, align 4
+ %index.next.2 = add i64 %index.next.1, 4
+ %15 = getelementptr inbounds float* %y, i64 %index.next.2
+ %16 = bitcast float* %15 to <4 x float>*
+ %wide.load.3 = load <4 x float>* %16, align 4
+ %17 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.3)
+ %18 = getelementptr inbounds float* %x, i64 %index.next.2
+ %19 = bitcast float* %18 to <4 x float>*
+ store <4 x float> %17, <4 x float>* %19, align 4
+ %index.next.3 = add i64 %index.next.2, 4
+ %20 = getelementptr inbounds float* %y, i64 %index.next.3
+ %21 = bitcast float* %20 to <4 x float>*
+ %wide.load.4 = load <4 x float>* %21, align 4
+ %22 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.4)
+ %23 = getelementptr inbounds float* %x, i64 %index.next.3
+ %24 = bitcast float* %23 to <4 x float>*
+ store <4 x float> %22, <4 x float>* %24, align 4
+ %index.next.4 = add i64 %index.next.3, 4
+ %25 = getelementptr inbounds float* %y, i64 %index.next.4
+ %26 = bitcast float* %25 to <4 x float>*
+ %wide.load.5 = load <4 x float>* %26, align 4
+ %27 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.5)
+ %28 = getelementptr inbounds float* %x, i64 %index.next.4
+ %29 = bitcast float* %28 to <4 x float>*
+ store <4 x float> %27, <4 x float>* %29, align 4
+ %index.next.5 = add i64 %index.next.4, 4
+ %30 = getelementptr inbounds float* %y, i64 %index.next.5
+ %31 = bitcast float* %30 to <4 x float>*
+ %wide.load.6 = load <4 x float>* %31, align 4
+ %32 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.6)
+ %33 = getelementptr inbounds float* %x, i64 %index.next.5
+ %34 = bitcast float* %33 to <4 x float>*
+ store <4 x float> %32, <4 x float>* %34, align 4
+ %index.next.6 = add i64 %index.next.5, 4
+ %35 = getelementptr inbounds float* %y, i64 %index.next.6
+ %36 = bitcast float* %35 to <4 x float>*
+ %wide.load.7 = load <4 x float>* %36, align 4
+ %37 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.7)
+ %38 = getelementptr inbounds float* %x, i64 %index.next.6
+ %39 = bitcast float* %38 to <4 x float>*
+ store <4 x float> %37, <4 x float>* %39, align 4
+ %index.next.7 = add i64 %index.next.6, 4
+ %40 = getelementptr inbounds float* %y, i64 %index.next.7
+ %41 = bitcast float* %40 to <4 x float>*
+ %wide.load.8 = load <4 x float>* %41, align 4
+ %42 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.8)
+ %43 = getelementptr inbounds float* %x, i64 %index.next.7
+ %44 = bitcast float* %43 to <4 x float>*
+ store <4 x float> %42, <4 x float>* %44, align 4
+ %index.next.8 = add i64 %index.next.7, 4
+ %45 = getelementptr inbounds float* %y, i64 %index.next.8
+ %46 = bitcast float* %45 to <4 x float>*
+ %wide.load.9 = load <4 x float>* %46, align 4
+ %47 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.9)
+ %48 = getelementptr inbounds float* %x, i64 %index.next.8
+ %49 = bitcast float* %48 to <4 x float>*
+ store <4 x float> %47, <4 x float>* %49, align 4
+ %index.next.9 = add i64 %index.next.8, 4
+ %50 = getelementptr inbounds float* %y, i64 %index.next.9
+ %51 = bitcast float* %50 to <4 x float>*
+ %wide.load.10 = load <4 x float>* %51, align 4
+ %52 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.10)
+ %53 = getelementptr inbounds float* %x, i64 %index.next.9
+ %54 = bitcast float* %53 to <4 x float>*
+ store <4 x float> %52, <4 x float>* %54, align 4
+ %index.next.10 = add i64 %index.next.9, 4
+ %55 = getelementptr inbounds float* %y, i64 %index.next.10
+ %56 = bitcast float* %55 to <4 x float>*
+ %wide.load.11 = load <4 x float>* %56, align 4
+ %57 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.11)
+ %58 = getelementptr inbounds float* %x, i64 %index.next.10
+ %59 = bitcast float* %58 to <4 x float>*
+ store <4 x float> %57, <4 x float>* %59, align 4
+ %index.next.11 = add i64 %index.next.10, 4
+ %60 = getelementptr inbounds float* %y, i64 %index.next.11
+ %61 = bitcast float* %60 to <4 x float>*
+ %wide.load.12 = load <4 x float>* %61, align 4
+ %62 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.12)
+ %63 = getelementptr inbounds float* %x, i64 %index.next.11
+ %64 = bitcast float* %63 to <4 x float>*
+ store <4 x float> %62, <4 x float>* %64, align 4
+ %index.next.12 = add i64 %index.next.11, 4
+ %65 = getelementptr inbounds float* %y, i64 %index.next.12
+ %66 = bitcast float* %65 to <4 x float>*
+ %wide.load.13 = load <4 x float>* %66, align 4
+ %67 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.13)
+ %68 = getelementptr inbounds float* %x, i64 %index.next.12
+ %69 = bitcast float* %68 to <4 x float>*
+ store <4 x float> %67, <4 x float>* %69, align 4
+ %index.next.13 = add i64 %index.next.12, 4
+ %70 = getelementptr inbounds float* %y, i64 %index.next.13
+ %71 = bitcast float* %70 to <4 x float>*
+ %wide.load.14 = load <4 x float>* %71, align 4
+ %72 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.14)
+ %73 = getelementptr inbounds float* %x, i64 %index.next.13
+ %74 = bitcast float* %73 to <4 x float>*
+ store <4 x float> %72, <4 x float>* %74, align 4
+ %index.next.14 = add i64 %index.next.13, 4
+ %75 = getelementptr inbounds float* %y, i64 %index.next.14
+ %76 = bitcast float* %75 to <4 x float>*
+ %wide.load.15 = load <4 x float>* %76, align 4
+ %77 = call <4 x float> @llvm_cos_v4f32(<4 x float> %wide.load.15)
+ %78 = getelementptr inbounds float* %x, i64 %index.next.14
+ %79 = bitcast float* %78 to <4 x float>*
+ store <4 x float> %77, <4 x float>* %79, align 4
+ %index.next.15 = add i64 %index.next.14, 4
+ %80 = icmp eq i64 %index.next.15, 2048
+ br i1 %80, label %for.end, label %vector.body
+
+for.end: ; preds = %vector.body
+ ret void
+}
+
+; Function Attrs: nounwind readonly
+declare <4 x float> @llvm_cos_v4f32(<4 x float>) #1
+
+define <2 x double> @bar(double* %x) {
+entry:
+ %p = bitcast double* %x to <2 x double>*
+ %r = load <2 x double>* %p, align 8
+
+; CHECK-LABEL: @bar
+; CHECK-NOT: lvsl
+; CHECK: blr
+
+ ret <2 x double> %r
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }