From 029a76b0a26981afb6bc3252b2a75b16393d893e Mon Sep 17 00:00:00 2001 From: Andrea Di Biagio Date: Wed, 12 Feb 2014 23:43:47 +0000 Subject: [Vectorizer] Add a new 'OperandValueKind' in TargetTransformInfo called 'OK_NonUniformConstValue' to identify operands which are constants but not constant splats. The cost model now allows returning 'OK_NonUniformConstValue' for non splat operands that are instances of ConstantVector or ConstantDataVector. With this change, targets are now able to compute different costs for instructions with non-uniform constant operands. For example, On X86 the cost of a vector shift may vary depending on whether the second operand is a uniform or non-uniform constant. This patch applies the following changes: - The cost model computation now takes into account non-uniform constants; - The cost of vector shift instructions has been improved in X86TargetTransformInfo analysis pass; - BBVectorize, SLPVectorizer and LoopVectorize now know how to distinguish between non-uniform and uniform constant operands. Added a new test to verify that the output of opt '-cost-model -analyze' is valid in the following configurations: SSE2, SSE4.1, AVX, AVX2. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@201272 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/Analysis/CostModel/X86/vshift-cost.ll | 167 +++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) create mode 100644 test/Analysis/CostModel/X86/vshift-cost.ll (limited to 'test/Analysis') diff --git a/test/Analysis/CostModel/X86/vshift-cost.ll b/test/Analysis/CostModel/X86/vshift-cost.ll new file mode 100644 index 0000000000..84d72463ac --- /dev/null +++ b/test/Analysis/CostModel/X86/vshift-cost.ll @@ -0,0 +1,167 @@ +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41 +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX +; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 + + +; Verify the cost of vector shift left instructions. + +; We always emit a single pmullw in the case of v8i16 vector shifts by +; non-uniform constant. + +define <8 x i16> @test1(<8 x i16> %a) { + %shl = shl <8 x i16> %a, + ret <8 x i16> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test1': +; CHECK: Found an estimated cost of 1 for instruction: %shl + + +define <8 x i16> @test2(<8 x i16> %a) { + %shl = shl <8 x i16> %a, + ret <8 x i16> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test2': +; CHECK: Found an estimated cost of 1 for instruction: %shl + + +; With SSE4.1, v4i32 shifts can be lowered into a single pmulld instruction. +; Make sure that the estimated cost is always 1 except for the case where +; we only have SSE2 support. With SSE2, we are forced to special lower the +; v4i32 mul as a 2x shuffle, 2x pmuludq, 2x shuffle. + +define <4 x i32> @test3(<4 x i32> %a) { + %shl = shl <4 x i32> %a, + ret <4 x i32> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test3': +; SSE2: Found an estimated cost of 6 for instruction: %shl +; SSE41: Found an estimated cost of 1 for instruction: %shl +; AVX: Found an estimated cost of 1 for instruction: %shl +; AVX2: Found an estimated cost of 1 for instruction: %shl + + +define <4 x i32> @test4(<4 x i32> %a) { + %shl = shl <4 x i32> %a, + ret <4 x i32> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test4': +; SSE2: Found an estimated cost of 6 for instruction: %shl +; SSE41: Found an estimated cost of 1 for instruction: %shl +; AVX: Found an estimated cost of 1 for instruction: %shl +; AVX2: Found an estimated cost of 1 for instruction: %shl + + +; On AVX2 we are able to lower the following shift into a single +; vpsllvq. Therefore, the expected cost is only 1. +; In all other cases, this shift is scalarized as the target does not support +; vpsllv instructions. + +define <2 x i64> @test5(<2 x i64> %a) { + %shl = shl <2 x i64> %a, + ret <2 x i64> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test5': +; SSE2: Found an estimated cost of 20 for instruction: %shl +; SSE41: Found an estimated cost of 20 for instruction: %shl +; AVX: Found an estimated cost of 20 for instruction: %shl +; AVX2: Found an estimated cost of 1 for instruction: %shl + + +; v16i16 and v8i32 shift left by non-uniform constant are lowered into +; vector multiply instructions. With AVX (but not AVX2), the vector multiply +; is lowered into a sequence of: 1 extract + 2 vpmullw + 1 insert. +; +; With AVX2, instruction vpmullw works with 256bit quantities and +; therefore there is no need to split the resulting vector multiply into +; a sequence of two multiply. +; +; With SSE2 and SSE4.1, the vector shift cost for 'test6' is twice +; the cost computed in the case of 'test1'. That is because the backend +; simply emits 2 pmullw with no extract/insert. + + +define <16 x i16> @test6(<16 x i16> %a) { + %shl = shl <16 x i16> %a, + ret <16 x i16> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test6': +; SSE2: Found an estimated cost of 2 for instruction: %shl +; SSE41: Found an estimated cost of 2 for instruction: %shl +; AVX: Found an estimated cost of 4 for instruction: %shl +; AVX2: Found an estimated cost of 1 for instruction: %shl + + +; With SSE2 and SSE4.1, the vector shift cost for 'test7' is twice +; the cost computed in the case of 'test3'. That is because the multiply +; is type-legalized into two 4i32 vector multiply. + +define <8 x i32> @test7(<8 x i32> %a) { + %shl = shl <8 x i32> %a, + ret <8 x i32> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test7': +; SSE2: Found an estimated cost of 12 for instruction: %shl +; SSE41: Found an estimated cost of 2 for instruction: %shl +; AVX: Found an estimated cost of 4 for instruction: %shl +; AVX2: Found an estimated cost of 1 for instruction: %shl + + +; On AVX2 we are able to lower the following shift into a single +; vpsllvq. Therefore, the expected cost is only 1. +; In all other cases, this shift is scalarized as the target does not support +; vpsllv instructions. + +define <4 x i64> @test8(<4 x i64> %a) { + %shl = shl <4 x i64> %a, + ret <4 x i64> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test8': +; SSE2: Found an estimated cost of 40 for instruction: %shl +; SSE41: Found an estimated cost of 40 for instruction: %shl +; AVX: Found an estimated cost of 40 for instruction: %shl +; AVX2: Found an estimated cost of 1 for instruction: %shl + + +; Same as 'test6', with the difference that the cost is double. + +define <32 x i16> @test9(<32 x i16> %a) { + %shl = shl <32 x i16> %a, + ret <32 x i16> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test9': +; SSE2: Found an estimated cost of 4 for instruction: %shl +; SSE41: Found an estimated cost of 4 for instruction: %shl +; AVX: Found an estimated cost of 8 for instruction: %shl +; AVX2: Found an estimated cost of 2 for instruction: %shl + + +; Same as 'test7', except that now the cost is double. + +define <16 x i32> @test10(<16 x i32> %a) { + %shl = shl <16 x i32> %a, + ret <16 x i32> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test10': +; SSE2: Found an estimated cost of 24 for instruction: %shl +; SSE41: Found an estimated cost of 4 for instruction: %shl +; AVX: Found an estimated cost of 8 for instruction: %shl +; AVX2: Found an estimated cost of 2 for instruction: %shl + + +; On AVX2 we are able to lower the following shift into a sequence of +; two vpsllvq instructions. Therefore, the expected cost is only 2. +; In all other cases, this shift is scalarized as we don't have vpsllv +; instructions. + +define <8 x i64> @test11(<8 x i64> %a) { + %shl = shl <8 x i64> %a, + ret <8 x i64> %shl +} +; CHECK: 'Cost Model Analysis' for function 'test11': +; SSE2: Found an estimated cost of 80 for instruction: %shl +; SSE41: Found an estimated cost of 80 for instruction: %shl +; AVX: Found an estimated cost of 80 for instruction: %shl +; AVX2: Found an estimated cost of 2 for instruction: %shl + + -- cgit v1.2.3