From 029a76b0a26981afb6bc3252b2a75b16393d893e Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Wed, 12 Feb 2014 23:43:47 +0000
Subject: [Vectorizer] Add a new 'OperandValueKind' in TargetTransformInfo
 called 'OK_NonUniformConstValue' to identify operands which are constants but
 not constant splats.

The cost model now allows returning 'OK_NonUniformConstValue'
for non splat operands that are instances of ConstantVector or
ConstantDataVector.

With this change, targets are now able to compute different costs
for instructions with non-uniform constant operands.
For example, On X86 the cost of a vector shift may vary depending on whether
the second operand is a uniform or non-uniform constant.

This patch applies the following changes:
 - The cost model computation now takes into account non-uniform constants;
 - The cost of vector shift instructions has been improved in
   X86TargetTransformInfo analysis pass;
 - BBVectorize, SLPVectorizer and LoopVectorize now know how to distinguish
   between non-uniform and uniform constant operands.

Added a new test to verify that the output of opt
'-cost-model -analyze' is valid in the following configurations: SSE2,
SSE4.1, AVX, AVX2.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@201272 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Analysis/CostModel/X86/vshift-cost.ll | 167 +++++++++++++++++++++++++++++
 1 file changed, 167 insertions(+)
 create mode 100644 test/Analysis/CostModel/X86/vshift-cost.ll

(limited to 'test/Analysis')
diff --git a/test/Analysis/CostModel/X86/vshift-cost.ll b/test/Analysis/CostModel/X86/vshift-cost.ll
new file mode 100644
index 0000000000..84d72463ac
--- /dev/null
+++ b/test/Analysis/CostModel/X86/vshift-cost.ll
@@ -0,0 +1,167 @@
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=+sse2,-sse4.1 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE41
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
+
+
+; Verify the cost of vector shift left instructions.
+
+; We always emit a single pmullw in the case of v8i16 vector shifts by
+; non-uniform constant.
+
+define <8 x i16> @test1(<8 x i16> %a) {
+  %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
+  ret <8 x i16> %shl
+}
+; CHECK: 'Cost Model Analysis' for function 'test1':
+; CHECK: Found an estimated cost of 1 for instruction:   %shl
+
+
+define <8 x i16> @test2(<8 x i16> %a) {
+  %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
+  ret <8 x i16> %shl
+}
+; CHECK: 'Cost Model Analysis' for function 'test2':
+; CHECK: Found an estimated cost of 1 for instruction:   %shl
+
+
+; With SSE4.1, v4i32 shifts can be lowered into a single pmulld instruction.
+; Make sure that the estimated cost is always 1 except for the case where
+; we only have SSE2 support. With SSE2, we are forced to special lower the
+; v4i32 mul as a 2x shuffle, 2x pmuludq, 2x shuffle.
+
+define <4 x i32> @test3(<4 x i32> %a) {
+  %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
+  ret <4 x i32> %shl
+}
+; CHECK: 'Cost Model Analysis' for function 'test3':
+; SSE2: Found an estimated cost of 6 for instruction:   %shl
+; SSE41: Found an estimated cost of 1 for instruction:   %shl
+; AVX: Found an estimated cost of 1 for instruction:   %shl
+; AVX2: Found an estimated cost of 1 for instruction:   %shl
+
+
+define <4 x i32> @test4(<4 x i32> %a) {
+  %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
+  ret <4 x i32> %shl
+}
+; CHECK: 'Cost Model Analysis' for function 'test4':
+; SSE2: Found an estimated cost of 6 for instruction:   %shl
+; SSE41: Found an estimated cost of 1 for instruction:   %shl
+; AVX: Found an estimated cost of 1 for instruction:   %shl
+; AVX2: Found an estimated cost of 1 for instruction:   %shl
+
+
+; On AVX2 we are able to lower the following shift into a single
+; vpsllvq. Therefore, the expected cost is only 1.
+; In all other cases, this shift is scalarized as the target does not support
+; vpsllv instructions.
+
+define <2 x i64> @test5(<2 x i64> %a) {
+  %shl = shl <2 x i64> %a, <i64 2, i64 3>
+  ret <2 x i64> %shl
+}
+; CHECK: 'Cost Model Analysis' for function 'test5':
+; SSE2: Found an estimated cost of 20 for instruction:   %shl
+; SSE41: Found an estimated cost of 20 for instruction:   %shl
+; AVX: Found an estimated cost of 20 for instruction:   %shl
+; AVX2: Found an estimated cost of 1 for instruction:   %shl
+
+
+; v16i16 and v8i32 shift left by non-uniform constant are lowered into
+; vector multiply instructions.  With AVX (but not AVX2), the vector multiply
+; is lowered into a sequence of: 1 extract + 2 vpmullw + 1 insert.
+;
+; With AVX2, instruction vpmullw works with 256bit quantities and
+; therefore there is no need to split the resulting vector multiply into
+; a sequence of two multiply.
+;
+; With SSE2 and SSE4.1, the vector shift cost for 'test6' is twice
+; the cost computed in the case of 'test1'. That is because the backend
+; simply emits 2 pmullw with no extract/insert.
+
+
+define <16 x i16> @test6(<16 x i16> %a) {
+  %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
+  ret <16 x i16> %shl
+}
+; CHECK: 'Cost Model Analysis' for function 'test6':
+; SSE2: Found an estimated cost of 2 for instruction:   %shl
+; SSE41: Found an estimated cost of 2 for instruction:   %shl
+; AVX: Found an estimated cost of 4 for instruction:   %shl
+; AVX2: Found an estimated cost of 1 for instruction:   %shl
+
+
+; With SSE2 and SSE4.1, the vector shift cost for 'test7' is twice
+; the cost computed in the case of 'test3'. That is because the multiply
+; is type-legalized into two 4i32 vector multiply.
+
+define <8 x i32> @test7(<8 x i32> %a) {
+  %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
+  ret <8 x i32> %shl
+}
+; CHECK: 'Cost Model Analysis' for function 'test7':
+; SSE2: Found an estimated cost of 12 for instruction:   %shl
+; SSE41: Found an estimated cost of 2 for instruction:   %shl
+; AVX: Found an estimated cost of 4 for instruction:   %shl
+; AVX2: Found an estimated cost of 1 for instruction:   %shl
+
+
+; On AVX2 we are able to lower the following shift into a single
+; vpsllvq. Therefore, the expected cost is only 1.
+; In all other cases, this shift is scalarized as the target does not support
+; vpsllv instructions.
+
+define <4 x i64> @test8(<4 x i64> %a) {
+  %shl = shl <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4>
+  ret <4 x i64> %shl
+}
+; CHECK: 'Cost Model Analysis' for function 'test8':
+; SSE2: Found an estimated cost of 40 for instruction:   %shl
+; SSE41: Found an estimated cost of 40 for instruction:   %shl
+; AVX: Found an estimated cost of 40 for instruction:   %shl
+; AVX2: Found an estimated cost of 1 for instruction:   %shl
+
+
+; Same as 'test6', with the difference that the cost is double.
+
+define <32 x i16> @test9(<32 x i16> %a) {
+  %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
+  ret <32 x i16> %shl
+}
+; CHECK: 'Cost Model Analysis' for function 'test9':
+; SSE2: Found an estimated cost of 4 for instruction:   %shl
+; SSE41: Found an estimated cost of 4 for instruction:   %shl
+; AVX: Found an estimated cost of 8 for instruction:   %shl
+; AVX2: Found an estimated cost of 2 for instruction:   %shl
+
+
+; Same as 'test7', except that now the cost is double.
+
+define <16 x i32> @test10(<16 x i32> %a) {
+  %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
+  ret <16 x i32> %shl
+}
+; CHECK: 'Cost Model Analysis' for function 'test10':
+; SSE2: Found an estimated cost of 24 for instruction:   %shl
+; SSE41: Found an estimated cost of 4 for instruction:   %shl
+; AVX: Found an estimated cost of 8 for instruction:   %shl
+; AVX2: Found an estimated cost of 2 for instruction:   %shl
+
+
+; On AVX2 we are able to lower the following shift into a sequence of
+; two vpsllvq instructions. Therefore, the expected cost is only 2.
+; In all other cases, this shift is scalarized as we don't have vpsllv
+; instructions.
+
+define <8 x i64> @test11(<8 x i64> %a) {
+  %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
+  ret <8 x i64> %shl
+}
+; CHECK: 'Cost Model Analysis' for function 'test11':
+; SSE2: Found an estimated cost of 80 for instruction:   %shl
+; SSE41: Found an estimated cost of 80 for instruction:   %shl
+; AVX: Found an estimated cost of 80 for instruction:   %shl
+; AVX2: Found an estimated cost of 2 for instruction:   %shl
+
+
-- 
cgit v1.2.3