From 2f8a6cdfa3bc0bfa4532da89e574666c5251cdb5 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer
Date: Sat, 22 Dec 2012 16:07:56 +0000
Subject: X86: Turn mul of <4 x i32> into pmuludq when no SSE4.1 is available.

pmuludq is slow, but it turns out that all the unpacking and packing of the
scalarized mul is even slower. 10% speedup on loop-vectorized paq8p.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@170985 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/sse2-mul.ll | 14 ++++++++++++++
 1 file changed, 14 insertions(+)
 create mode 100644 test/CodeGen/X86/sse2-mul.ll

(limited to 'test/CodeGen/X86/sse2-mul.ll')

diff --git a/test/CodeGen/X86/sse2-mul.ll b/test/CodeGen/X86/sse2-mul.ll
new file mode 100644
index 0000000000..0466d60ec3
--- /dev/null
+++ b/test/CodeGen/X86/sse2-mul.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -march=x86-64 -mcpu=core2 | FileCheck %s
+
+define <4 x i32> @test1(<4 x i32> %x, <4 x i32> %y) {
+  %m = mul <4 x i32> %x, %y
+  ret <4 x i32> %m
+; CHECK: test1:
+; CHECK: pshufd $49
+; CHECK: pmuludq
+; CHECK: pshufd $49
+; CHECK: pmuludq
+; CHECK: shufps $-120
+; CHECK: pshufd $-40
+; CHECK: ret
+}
-- 
cgit v1.2.3