diff options
author | Filipe Cabecinhas <me@filcab.net> | 2014-04-25 23:51:17 +0000 |
---|---|---|
committer | Filipe Cabecinhas <me@filcab.net> | 2014-04-25 23:51:17 +0000 |
commit | 3c0216517260d98da308950c1bf93f9767a24fb6 (patch) | |
tree | 91266485bd59c2fff3e7af008e6fd54f9824fb77 /test/CodeGen | |
parent | cee7abfb2c9b517f0120bbf8da04f433ebba942a (diff) | |
download | llvm-3c0216517260d98da308950c1bf93f9767a24fb6.tar.gz llvm-3c0216517260d98da308950c1bf93f9767a24fb6.tar.bz2 llvm-3c0216517260d98da308950c1bf93f9767a24fb6.tar.xz |
Optimization for certain shufflevector by using insertps.
Summary:
If we're doing a v4f32/v4i32 shuffle on x86 with SSE4.1, we can lower
certain shufflevectors to an insertps instruction:
When most of the shufflevector result's elements come from one vector (and
keep their index), and one element comes from another vector or a memory
operand.
Added tests for insertps optimizations on shufflevector.
Added support and tests for v4i32 vector optimization.
Reviewers: nadav
Subscribers: llvm-commits
Differential Revision: http://reviews.llvm.org/D3475
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@207291 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'test/CodeGen')
-rw-r--r-- | test/CodeGen/X86/sse41.ll | 75 |
1 files changed, 73 insertions, 2 deletions
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll index c15e24ccc9..03f5c9fbb5 100644 --- a/test/CodeGen/X86/sse41.ll +++ b/test/CodeGen/X86/sse41.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32 -; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64 +; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32 --check-prefix=CHECK +; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64 --check-prefix=CHECK @g16 = external global i16 @@ -249,3 +249,74 @@ entry: ; X64: ret } +define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) { +entry: + %0 = load <4 x float>* %pb, align 16 + %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4> + ret <4 x float> %vecinit6 +; CHECK-LABEL: insertps_from_shufflevector_1: +; CHECK-NOT: shufps +; CHECK: insertps $48, +; CHECK: ret +} + +define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) { +entry: + %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3> + ret <4 x float> %vecinit6 +; CHECK-LABEL: insertps_from_shufflevector_2: +; CHECK-NOT: mov +; CHECK-NOT: shufps +; CHECK: insertps $96, +; CHECK: ret +} + +; For loading an i32 from memory into an xmm register we use pinsrd +; instead of insertps +define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) { +entry: + %0 = load <4 x i32>* %pb, align 16 + %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4> + ret <4 x i32> %vecinit6 +; CHECK-LABEL: pinsrd_from_shufflevector_i32: +; CHECK-NOT: mov +; CHECK-NOT: shufps +; CHECK: pinsrd $3, +; CHECK: ret +} + +define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) { +entry: + %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3> + ret <4 x i32> %vecinit6 +; CHECK-LABEL: insertps_from_shufflevector_i32_2: +; CHECK-NOT: mov +; CHECK-NOT: shufps +; CHECK: insertps $208, +; CHECK: ret +} + +define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) { +; CHECK-LABEL: insertps_from_load_ins_elt_undef: +; CHECK-NOT: mov +; CHECK-NOT: shufps +; CHECK: insertps $16, +; CHECK: ret + %1 = load float* %b, align 4 + %2 = insertelement <4 x float> undef, float %1, i32 0 + %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3> + ret <4 x float> %result +} + +define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) { +; CHECK-LABEL: insertps_from_load_ins_elt_undef_i32: +; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr +;; aCHECK-NOT: mov +; CHECK-NOT: shufps +; CHECK: insertps $32, +; CHECK: ret + %1 = load i32* %b, align 4 + %2 = insertelement <4 x i32> undef, i32 %1, i32 0 + %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3> + ret <4 x i32> %result +} |