summaryrefslogtreecommitdiff
path: root/test/CodeGen
diff options
context:
space:
mode:
authorFilipe Cabecinhas <me@filcab.net>2014-04-25 23:51:17 +0000
committerFilipe Cabecinhas <me@filcab.net>2014-04-25 23:51:17 +0000
commit3c0216517260d98da308950c1bf93f9767a24fb6 (patch)
tree91266485bd59c2fff3e7af008e6fd54f9824fb77 /test/CodeGen
parentcee7abfb2c9b517f0120bbf8da04f433ebba942a (diff)
downloadllvm-3c0216517260d98da308950c1bf93f9767a24fb6.tar.gz
llvm-3c0216517260d98da308950c1bf93f9767a24fb6.tar.bz2
llvm-3c0216517260d98da308950c1bf93f9767a24fb6.tar.xz
Optimization for certain shufflevector by using insertps.
Summary: If we're doing a v4f32/v4i32 shuffle on x86 with SSE4.1, we can lower certain shufflevectors to an insertps instruction: When most of the shufflevector result's elements come from one vector (and keep their index), and one element comes from another vector or a memory operand. Added tests for insertps optimizations on shufflevector. Added support and tests for v4i32 vector optimization. Reviewers: nadav Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D3475 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@207291 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'test/CodeGen')
-rw-r--r--test/CodeGen/X86/sse41.ll75
1 files changed, 73 insertions, 2 deletions
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll
index c15e24ccc9..03f5c9fbb5 100644
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32 --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64 --check-prefix=CHECK
@g16 = external global i16
@@ -249,3 +249,74 @@ entry:
; X64: ret
}
+define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+entry:
+ %0 = load <4 x float>* %pb, align 16
+ %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x float> %vecinit6
+; CHECK-LABEL: insertps_from_shufflevector_1:
+; CHECK-NOT: shufps
+; CHECK: insertps $48,
+; CHECK: ret
+}
+
+define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
+entry:
+ %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
+ ret <4 x float> %vecinit6
+; CHECK-LABEL: insertps_from_shufflevector_2:
+; CHECK-NOT: mov
+; CHECK-NOT: shufps
+; CHECK: insertps $96,
+; CHECK: ret
+}
+
+; For loading an i32 from memory into an xmm register we use pinsrd
+; instead of insertps
+define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
+entry:
+ %0 = load <4 x i32>* %pb, align 16
+ %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x i32> %vecinit6
+; CHECK-LABEL: pinsrd_from_shufflevector_i32:
+; CHECK-NOT: mov
+; CHECK-NOT: shufps
+; CHECK: pinsrd $3,
+; CHECK: ret
+}
+
+define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
+entry:
+ %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
+ ret <4 x i32> %vecinit6
+; CHECK-LABEL: insertps_from_shufflevector_i32_2:
+; CHECK-NOT: mov
+; CHECK-NOT: shufps
+; CHECK: insertps $208,
+; CHECK: ret
+}
+
+define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
+; CHECK-LABEL: insertps_from_load_ins_elt_undef:
+; CHECK-NOT: mov
+; CHECK-NOT: shufps
+; CHECK: insertps $16,
+; CHECK: ret
+ %1 = load float* %b, align 4
+ %2 = insertelement <4 x float> undef, float %1, i32 0
+ %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
+ ret <4 x float> %result
+}
+
+define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
+; CHECK-LABEL: insertps_from_load_ins_elt_undef_i32:
+; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
+;; aCHECK-NOT: mov
+; CHECK-NOT: shufps
+; CHECK: insertps $32,
+; CHECK: ret
+ %1 = load i32* %b, align 4
+ %2 = insertelement <4 x i32> undef, i32 %1, i32 0
+ %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+ ret <4 x i32> %result
+}