summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJim Grosbach <grosbach@apple.com>2013-08-29 22:41:46 +0000
committerJim Grosbach <grosbach@apple.com>2013-08-29 22:41:46 +0000
commit88acef0b8e93d065aa4de164422ce4c546a7cd5f (patch)
treed062d7830b9109b7951bc1e8658ebc32d518574c
parentff372dc18d0d569cbcfb7346eb891d6a96219043 (diff)
downloadllvm-88acef0b8e93d065aa4de164422ce4c546a7cd5f.tar.gz
llvm-88acef0b8e93d065aa4de164422ce4c546a7cd5f.tar.bz2
llvm-88acef0b8e93d065aa4de164422ce4c546a7cd5f.tar.xz
ARM: Improve pattern for isel mul of vector by scalar.
In addition to recognizing when the multiply's second argument is coming from an explicit VDUPLANE, also look for a plain scalar f32 reference and reference it via the corresponding vector lane. rdar://14870054 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@189619 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--lib/Target/ARM/ARMInstrNEON.td11
-rw-r--r--test/CodeGen/ARM/vmul.ll18
2 files changed, 29 insertions, 0 deletions
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 49ae3348cd..f1bd37ea52 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -4022,6 +4022,17 @@ def : Pat<(v4f32 (fmul (v4f32 QPR:$src1),
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
+
+def : Pat<(v2f32 (fmul DPR:$Rn, (NEONvdup (f32 SPR:$Rm)))), // D-reg fmul whose second operand is a splat of scalar f32 $Rm
+ (VMULslfd DPR:$Rn,
+ (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0), // put $Rm into ssub_0 of an otherwise-undefined v2f32
+ (i32 0))>; // lane operand 0, pairing with the ssub_0 insertion above
+def : Pat<(v4f32 (fmul QPR:$Rn, (NEONvdup (f32 SPR:$Rm)))), // Q-reg form of the same splat-multiply match
+ (VMULslfq QPR:$Rn,
+ (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), SPR:$Rm, ssub_0), // scalar operand is still built as a v2f32 here, not a v4f32
+ (i32 0))>; // lane operand 0, pairing with the ssub_0 insertion above
+
+
// VQDMULH : Vector Saturating Doubling Multiply Returning High Half
defm VQDMULH : N3VInt_HS<0, 0, 0b1011, 0, N3RegFrm, IIC_VMULi16D, IIC_VMULi32D,
IIC_VMULi16Q, IIC_VMULi32Q,
diff --git a/test/CodeGen/ARM/vmul.ll b/test/CodeGen/ARM/vmul.ll
index 6210ad3695..5e5e99bc2f 100644
--- a/test/CodeGen/ARM/vmul.ll
+++ b/test/CodeGen/ARM/vmul.ll
@@ -623,3 +623,21 @@ entry:
store <4 x i32> %predphi290.v.i, <4 x i32>* undef, align 4
ret void
}
+
+define void @foo(<4 x float> * %a, <4 x float>* nocapture %dst, float* nocapture readonly %src) nounwind {
+; Look for a normal scalar FP load rather than a to-all-lanes load,
+; e.g., "vldr s0, [r2]" rather than "vld1.32 {d18[], d19[]}, [r2:32]".
+; Then check that the vector multiply has folded the splat to all lanes
+; and used a vector * scalar instruction.
+; CHECK: vldr {{s[0-9]+}}, [r2]
+; CHECK: vmul.f32 q8, q8, d0[0]
+ %tmp = load float* %src, align 4
+ %tmp5 = load <4 x float>* %a, align 4
+ %tmp6 = insertelement <4 x float> undef, float %tmp, i32 0
+ %tmp7 = insertelement <4 x float> %tmp6, float %tmp, i32 1
+ %tmp8 = insertelement <4 x float> %tmp7, float %tmp, i32 2
+ %tmp9 = insertelement <4 x float> %tmp8, float %tmp, i32 3
+ %tmp10 = fmul <4 x float> %tmp9, %tmp5
+ store <4 x float> %tmp10, <4 x float>* %dst, align 4
+ ret void
+}