author    Hal Finkel <hfinkel@anl.gov>    2013-05-24 23:00:14 +0000
committer Hal Finkel <hfinkel@anl.gov>    2013-05-24 23:00:14 +0000
commit    80d10ded8cd4f34b87d82b03d6f63328ea337b26 (patch)
tree      238b9081baf073661f9ae2bfc3ed8e22f7935110 /test/CodeGen
parent    3b77151a61d2985ad5b29ee3d05b34d553322c2a (diff)
PPC: Initial support for permutation-based unaligned Altivec loads
Altivec only directly supports aligned loads, but these loads have a strange property: if given an unaligned address, they truncate the address to the next lower aligned address and load from there. This property, along with an extra load and some special-purpose permutation-control instructions that generate the appropriate permutations from the original unaligned address, allows efficient lowering of unaligned loads. This code uses the trick explained in the Apple Velocity Engine optimization overview document to prevent the extra load from causing a page fault when the original address happens to be aligned. As noted in the FIXMEs, several additional optimizations can be performed to reduce the cost of these loads further; these will be implemented in future commits.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182691 91177308-0d34-0410-b5e6-96231b3b80d8
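For reference, the lowering described above corresponds to the classic AltiVec unaligned-load idiom, shown here as a minimal C sketch using the standard altivec.h intrinsics (vec_lvsl, vec_ld, vec_perm). The helper name load_unaligned is illustrative only and does not appear in the patch.

#include <altivec.h>

/* Load four floats from a possibly unaligned address. */
static vector float load_unaligned(const float *p) {
    /* lvsl builds a permute mask from the low-order address bits. */
    vector unsigned char mask = vec_lvsl(0, p);
    /* lvx truncates the address to 16-byte alignment, so these two
       loads fetch the aligned blocks that straddle the wanted data. */
    vector float lo = vec_ld(0, p);
    /* Offset 15 (not 16) is the Velocity Engine trick: if p is already
       aligned, p + 15 stays in the same 16-byte block, so the second
       load cannot fault on a page the data does not touch. */
    vector float hi = vec_ld(15, p);
    /* vperm stitches the requested 16 bytes out of the two blocks. */
    return vec_perm(lo, hi, mask);
}

Applied to the loop in the test below, this pattern appears as the lvsl/lvx/vperm sequence that the CHECK lines verify.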
Diffstat (limited to 'test/CodeGen')
-rw-r--r--  test/CodeGen/PowerPC/unal-altivec.ll  |  45
1 file changed, 45 insertions, 0 deletions
diff --git a/test/CodeGen/PowerPC/unal-altivec.ll b/test/CodeGen/PowerPC/unal-altivec.ll
new file mode 100644
index 0000000000..f89f299e14
--- /dev/null
+++ b/test/CodeGen/PowerPC/unal-altivec.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -mcpu=g5 | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define void @foo(float* noalias nocapture %a, float* noalias nocapture %b) #0 {
+vector.ph:
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %0 = getelementptr inbounds float* %b, i64 %index
+ %1 = bitcast float* %0 to <4 x float>*
+ %wide.load = load <4 x float>* %1, align 4
+ %.sum11 = or i64 %index, 4
+ %2 = getelementptr float* %b, i64 %.sum11
+ %3 = bitcast float* %2 to <4 x float>*
+ %wide.load8 = load <4 x float>* %3, align 4
+ %4 = fadd <4 x float> %wide.load, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+ %5 = fadd <4 x float> %wide.load8, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+ %6 = getelementptr inbounds float* %a, i64 %index
+ %7 = bitcast float* %6 to <4 x float>*
+ store <4 x float> %4, <4 x float>* %7, align 4
+ %.sum12 = or i64 %index, 4
+ %8 = getelementptr float* %a, i64 %.sum12
+ %9 = bitcast float* %8 to <4 x float>*
+ store <4 x float> %5, <4 x float>* %9, align 4
+ %index.next = add i64 %index, 8
+ %10 = icmp eq i64 %index.next, 16000
+ br i1 %10, label %for.end, label %vector.body
+
+; CHECK: @foo
+; CHECK: lvx [[CNST:[0-9]+]],
+; CHECK-DAG: lvsl [[PC:[0-9]+]], [[B1:[0-9]+]], [[B2:[0-9]+]]
+; CHECK-DAG: lvx [[LD1:[0-9]+]], [[B1]], [[B2]]
+; CHECK-DAG: add [[B3:[0-9]+]], [[B1]], [[B2]]
+; CHECK-DAG: lvx [[LD2:[0-9]+]], [[B3]],
+; CHECK-DAG: vperm [[R1:[0-9]+]], [[LD1]], [[LD2]], [[PC]]
+; CHECK: vaddfp {{[0-9]+}}, [[R1]], [[CNST]]
+; CHECK: blr
+
+for.end: ; preds = %vector.body
+ ret void
+}
+
+attributes #0 = { nounwind }