X86 cost model: Differentiate cost for vector shifts of constants

SSE2 has efficient support for shifts by a scalar. My previous change of making shifts expensive did not take this into account, marking all shifts as expensive. This would prevent vectorization from happening where it is actually beneficial. With this change we differentiate between shifts by constants and other shifts. radar://13576547 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@178808 91177308-0d34-0410-b5e6-96231b3b80d8
author: Arnold Schwaighofer <aschwaighofer@apple.com> 2013-04-04 23:26:24 +0000
committer: Arnold Schwaighofer <aschwaighofer@apple.com> 2013-04-04 23:26:24 +0000
commit: 2537f3c6597bc1b8eb14c76c8f8e7046be41c9ba (patch)
tree: 781143f2b27f08fe01dcfe79e732057fc6847445 /test
parent: 6bf4f676413b8f7d97aaff289997aab344180957 (diff)
download: llvm-2537f3c6597bc1b8eb14c76c8f8e7046be41c9ba.tar.gz
llvm-2537f3c6597bc1b8eb14c76c8f8e7046be41c9ba.tar.bz2
llvm-2537f3c6597bc1b8eb14c76c8f8e7046be41c9ba.tar.xz
3 files changed, 863 insertions, 0 deletions
diff --git a/test/Analysis/CostModel/X86/testshiftashr.ll b/test/Analysis/CostModel/X86/testshiftashr.ll
index d932b2a4c4..f35eea8716 100644
--- a/test/Analysis/CostModel/X86/testshiftashr.ll
+++ b/test/Analysis/CostModel/X86/testshiftashr.ll
@@ -241,3 +241,291 @@ entry:
   ret %shifttype32i8 %0
 }
 
+; Test shift by a constant vector.
+
+%shifttypec = type <2 x i16>
+define %shifttypec @shift2i16const(%shifttypec %a, %shifttypec %b) {
+entry:
+  ; SSE2: shift2i16const
+  ; SSE2: cost of 20 {{.*}} ashr
+  ; SSE2-CODEGEN: shift2i16const
+  ; SSE2-CODEGEN: sarq $
+
+  %0 = ashr %shifttypec %a , <i16 3, i16 3>
+  ret %shifttypec %0
+}
+
+%shifttypec4i16 = type <4 x i16>
+define %shifttypec4i16 @shift4i16const(%shifttypec4i16 %a, %shifttypec4i16 %b) {
+entry:
+  ; SSE2: shift4i16const
+  ; SSE2: cost of 1 {{.*}} ashr
+  ; SSE2-CODEGEN: shift4i16const
+  ; SSE2-CODEGEN: psrad $3
+
+  %0 = ashr %shifttypec4i16 %a , <i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec4i16 %0
+}
+
+%shifttypec8i16 = type <8 x i16>
+define %shifttypec8i16 @shift8i16const(%shifttypec8i16 %a, %shifttypec8i16 %b) {
+entry:
+  ; SSE2: shift8i16const
+  ; SSE2: cost of 1 {{.*}} ashr
+  ; SSE2-CODEGEN: shift8i16const
+  ; SSE2-CODEGEN: psraw $3
+
+  %0 = ashr %shifttypec8i16 %a , <i16 3, i16 3, i16 3, i16 3,
+                                  i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec8i16 %0
+}
+
+%shifttypec16i16 = type <16 x i16>
+define %shifttypec16i16 @shift16i16const(%shifttypec16i16 %a,
+                                         %shifttypec16i16 %b) {
+entry:
+  ; SSE2: shift16i16const
+  ; SSE2: cost of 2 {{.*}} ashr
+  ; SSE2-CODEGEN: shift16i16const
+  ; SSE2-CODEGEN: psraw $3
+
+  %0 = ashr %shifttypec16i16 %a , <i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec16i16 %0
+}
+
+%shifttypec32i16 = type <32 x i16>
+define %shifttypec32i16 @shift32i16const(%shifttypec32i16 %a,
+                                        %shifttypec32i16 %b) {
+entry:
+  ; SSE2: shift32i16const
+  ; SSE2: cost of 4 {{.*}} ashr
+  ; SSE2-CODEGEN: shift32i16const
+  ; SSE2-CODEGEN: psraw $3
+
+  %0 = ashr %shifttypec32i16 %a , <i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec32i16 %0
+}
+
+%shifttypec2i32 = type <2 x i32>
+define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) {
+entry:
+  ; SSE2: shift2i32c
+  ; SSE2: cost of 20 {{.*}} ashr
+  ; SSE2-CODEGEN: shift2i32c
+  ; SSE2-CODEGEN: sarq $3
+
+  %0 = ashr %shifttypec2i32 %a , <i32 3, i32 3>
+  ret %shifttypec2i32 %0
+}
+
+%shifttypec4i32 = type <4 x i32>
+define %shifttypec4i32 @shift4i32c(%shifttypec4i32 %a, %shifttypec4i32 %b) {
+entry:
+  ; SSE2: shift4i32c
+  ; SSE2: cost of 1 {{.*}} ashr
+  ; SSE2-CODEGEN: shift4i32c
+  ; SSE2-CODEGEN: psrad $3
+
+  %0 = ashr %shifttypec4i32 %a , <i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec4i32 %0
+}
+
+%shifttypec8i32 = type <8 x i32>
+define %shifttypec8i32 @shift8i32c(%shifttypec8i32 %a, %shifttypec8i32 %b) {
+entry:
+  ; SSE2: shift8i32c
+  ; SSE2: cost of 2 {{.*}} ashr
+  ; SSE2-CODEGEN: shift8i32c
+  ; SSE2-CODEGEN: psrad $3
+
+  %0 = ashr %shifttypec8i32 %a , <i32 3, i32 3, i32 3, i32 3,
+                                  i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec8i32 %0
+}
+
+%shifttypec16i32 = type <16 x i32>
+define %shifttypec16i32 @shift16i32c(%shifttypec16i32 %a, %shifttypec16i32 %b) {
+entry:
+  ; SSE2: shift16i32c
+  ; SSE2: cost of 4 {{.*}} ashr
+  ; SSE2-CODEGEN: shift16i32c
+  ; SSE2-CODEGEN: psrad $3
+
+  %0 = ashr %shifttypec16i32 %a , <i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec16i32 %0
+}
+
+%shifttypec32i32 = type <32 x i32>
+define %shifttypec32i32 @shift32i32c(%shifttypec32i32 %a, %shifttypec32i32 %b) {
+entry:
+  ; SSE2: shift32i32c
+  ; getTypeConversion fails here and promotes this to an i64.
+  ; SSE2: cost of 256 {{.*}} ashr
+  ; SSE2-CODEGEN: shift32i32c
+  ; SSE2-CODEGEN: psrad $3
+  %0 = ashr %shifttypec32i32 %a , <i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec32i32 %0
+}
+
+%shifttypec2i64 = type <2 x i64>
+define %shifttypec2i64 @shift2i64c(%shifttypec2i64 %a, %shifttypec2i64 %b) {
+entry:
+  ; SSE2: shift2i64c
+  ; SSE2: cost of 20 {{.*}} ashr
+  ; SSE2-CODEGEN: shift2i64c
+  ; SSE2-CODEGEN: sarq $3
+
+  %0 = ashr %shifttypec2i64 %a , <i64 3, i64 3>
+  ret %shifttypec2i64 %0
+}
+
+%shifttypec4i64 = type <4 x i64>
+define %shifttypec4i64 @shift4i64c(%shifttypec4i64 %a, %shifttypec4i64 %b) {
+entry:
+  ; SSE2: shift4i64c
+  ; SSE2: cost of 40 {{.*}} ashr
+  ; SSE2-CODEGEN: shift4i64c
+  ; SSE2-CODEGEN: sarq $3
+
+  %0 = ashr %shifttypec4i64 %a , <i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec4i64 %0
+}
+
+%shifttypec8i64 = type <8 x i64>
+define %shifttypec8i64 @shift8i64c(%shifttypec8i64 %a, %shifttypec8i64 %b) {
+entry:
+  ; SSE2: shift8i64c
+  ; SSE2: cost of 80 {{.*}} ashr
+  ; SSE2-CODEGEN: shift8i64c
+  ; SSE2-CODEGEN: sarq $3
+
+ %0 = ashr %shifttypec8i64 %a , <i64 3, i64 3, i64 3, i64 3,
+                                 i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec8i64 %0
+}
+
+%shifttypec16i64 = type <16 x i64>
+define %shifttypec16i64 @shift16i64c(%shifttypec16i64 %a, %shifttypec16i64 %b) {
+entry:
+  ; SSE2: shift16i64c
+  ; SSE2: cost of 160 {{.*}} ashr
+  ; SSE2-CODEGEN: shift16i64c
+  ; SSE2-CODEGEN: sarq $3
+
+  %0 = ashr %shifttypec16i64 %a , <i64 3, i64 3, i64 3, i64 3,
+                                   i64 3, i64 3, i64 3, i64 3,
+                                   i64 3, i64 3, i64 3, i64 3,
+                                   i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec16i64 %0
+}
+
+%shifttypec32i64 = type <32 x i64>
+define %shifttypec32i64 @shift32i64c(%shifttypec32i64 %a, %shifttypec32i64 %b) {
+entry:
+  ; SSE2: shift32i64c
+  ; SSE2: cost of 256 {{.*}} ashr
+  ; SSE2-CODEGEN: shift32i64c
+  ; SSE2-CODEGEN: sarq $3
+
+  %0 = ashr %shifttypec32i64 %a ,<i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec32i64 %0
+}
+
+%shifttypec2i8 = type <2 x i8>
+define %shifttypec2i8 @shift2i8c(%shifttypec2i8 %a, %shifttypec2i8 %b) {
+entry:
+  ; SSE2: shift2i8c
+  ; SSE2: cost of 20 {{.*}} ashr
+  ; SSE2-CODEGEN: shift2i8c
+  ; SSE2-CODEGEN: sarq $3
+
+  %0 = ashr %shifttypec2i8 %a , <i8 3, i8 3>
+  ret %shifttypec2i8 %0
+}
+
+%shifttypec4i8 = type <4 x i8>
+define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) {
+entry:
+  ; SSE2: shift4i8c
+  ; SSE2: cost of 1 {{.*}} ashr
+  ; SSE2-CODEGEN: shift4i8c
+  ; SSE2-CODEGEN: psrad $3
+
+  %0 = ashr %shifttypec4i8 %a , <i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec4i8 %0
+}
+
+%shifttypec8i8 = type <8 x i8>
+define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) {
+entry:
+  ; SSE2: shift8i8c
+  ; SSE2: cost of 1 {{.*}} ashr
+  ; SSE2-CODEGEN: shift8i8c
+  ; SSE2-CODEGEN: psraw $3
+
+  %0 = ashr %shifttypec8i8 %a , <i8 3, i8 3, i8 3, i8 3,
+                                 i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec8i8 %0
+}
+
+%shifttypec16i8 = type <16 x i8>
+define %shifttypec16i8 @shift16i8c(%shifttypec16i8 %a, %shifttypec16i8 %b) {
+entry:
+  ; SSE2: shift16i8c
+  ; SSE2: cost of 4 {{.*}} ashr
+  ; SSE2-CODEGEN: shift16i8c
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = ashr %shifttypec16i8 %a , <i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec16i8 %0
+}
+
+%shifttypec32i8 = type <32 x i8>
+define %shifttypec32i8 @shift32i8c(%shifttypec32i8 %a, %shifttypec32i8 %b) {
+entry:
+  ; SSE2: shift32i8c
+  ; SSE2: cost of 8 {{.*}} ashr
+  ; SSE2-CODEGEN: shift32i8c
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = ashr %shifttypec32i8 %a , <i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec32i8 %0
+}
+
diff --git a/test/Analysis/CostModel/X86/testshiftlshr.ll b/test/Analysis/CostModel/X86/testshiftlshr.ll
index 7d665fc9c6..8d6ef38742 100644
--- a/test/Analysis/CostModel/X86/testshiftlshr.ll
+++ b/test/Analysis/CostModel/X86/testshiftlshr.ll
@@ -241,3 +241,290 @@ entry:
   ret %shifttype32i8 %0
 }
 
+; Test shift by a constant vector.
+
+%shifttypec = type <2 x i16>
+define %shifttypec @shift2i16const(%shifttypec %a, %shifttypec %b) {
+entry:
+  ; SSE2: shift2i16const
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN: shift2i16const
+  ; SSE2-CODEGEN: psrlq $3
+
+  %0 = lshr %shifttypec %a , <i16 3, i16 3>
+  ret %shifttypec %0
+}
+
+%shifttypec4i16 = type <4 x i16>
+define %shifttypec4i16 @shift4i16const(%shifttypec4i16 %a, %shifttypec4i16 %b) {
+entry:
+  ; SSE2: shift4i16const
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN: shift4i16const
+  ; SSE2-CODEGEN: psrld $3
+
+  %0 = lshr %shifttypec4i16 %a , <i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec4i16 %0
+}
+
+%shifttypec8i16 = type <8 x i16>
+define %shifttypec8i16 @shift8i16const(%shifttypec8i16 %a, %shifttypec8i16 %b) {
+entry:
+  ; SSE2: shift8i16const
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN: shift8i16const
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = lshr %shifttypec8i16 %a , <i16 3, i16 3, i16 3, i16 3,
+                                  i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec8i16 %0
+}
+
+%shifttypec16i16 = type <16 x i16>
+define %shifttypec16i16 @shift16i16const(%shifttypec16i16 %a,
+                                         %shifttypec16i16 %b) {
+entry:
+  ; SSE2: shift16i16const
+  ; SSE2: cost of 2 {{.*}} lshr
+  ; SSE2-CODEGEN: shift16i16const
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = lshr %shifttypec16i16 %a , <i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec16i16 %0
+}
+
+%shifttypec32i16 = type <32 x i16>
+define %shifttypec32i16 @shift32i16const(%shifttypec32i16 %a,
+                                        %shifttypec32i16 %b) {
+entry:
+  ; SSE2: shift32i16const
+  ; SSE2: cost of 4 {{.*}} lshr
+  ; SSE2-CODEGEN: shift32i16const
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = lshr %shifttypec32i16 %a , <i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec32i16 %0
+}
+
+%shifttypec2i32 = type <2 x i32>
+define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) {
+entry:
+  ; SSE2: shift2i32c
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN: shift2i32c
+  ; SSE2-CODEGEN: psrlq $3
+
+  %0 = lshr %shifttypec2i32 %a , <i32 3, i32 3>
+  ret %shifttypec2i32 %0
+}
+
+%shifttypec4i32 = type <4 x i32>
+define %shifttypec4i32 @shift4i32c(%shifttypec4i32 %a, %shifttypec4i32 %b) {
+entry:
+  ; SSE2: shift4i32c
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN: shift4i32c
+  ; SSE2-CODEGEN: psrld $3
+
+  %0 = lshr %shifttypec4i32 %a , <i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec4i32 %0
+}
+
+%shifttypec8i32 = type <8 x i32>
+define %shifttypec8i32 @shift8i32c(%shifttypec8i32 %a, %shifttypec8i32 %b) {
+entry:
+  ; SSE2: shift8i32c
+  ; SSE2: cost of 2 {{.*}} lshr
+  ; SSE2-CODEGEN: shift8i32c
+  ; SSE2-CODEGEN: psrld $3
+
+  %0 = lshr %shifttypec8i32 %a , <i32 3, i32 3, i32 3, i32 3,
+                                  i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec8i32 %0
+}
+
+%shifttypec16i32 = type <16 x i32>
+define %shifttypec16i32 @shift16i32c(%shifttypec16i32 %a, %shifttypec16i32 %b) {
+entry:
+  ; SSE2: shift16i32c
+  ; SSE2: cost of 4 {{.*}} lshr
+  ; SSE2-CODEGEN: shift16i32c
+  ; SSE2-CODEGEN: psrld $3
+
+  %0 = lshr %shifttypec16i32 %a , <i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec16i32 %0
+}
+
+%shifttypec32i32 = type <32 x i32>
+define %shifttypec32i32 @shift32i32c(%shifttypec32i32 %a, %shifttypec32i32 %b) {
+entry:
+  ; SSE2: shift32i32c
+  ; getTypeConversion fails here and promotes this to an i64.
+  ; SSE2: cost of 256 {{.*}} lshr
+  ; SSE2-CODEGEN: shift32i32c
+  ; SSE2-CODEGEN: psrld $3
+  %0 = lshr %shifttypec32i32 %a , <i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec32i32 %0
+}
+
+%shifttypec2i64 = type <2 x i64>
+define %shifttypec2i64 @shift2i64c(%shifttypec2i64 %a, %shifttypec2i64 %b) {
+entry:
+  ; SSE2: shift2i64c
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN: shift2i64c
+  ; SSE2-CODEGEN: psrlq $3
+
+  %0 = lshr %shifttypec2i64 %a , <i64 3, i64 3>
+  ret %shifttypec2i64 %0
+}
+
+%shifttypec4i64 = type <4 x i64>
+define %shifttypec4i64 @shift4i64c(%shifttypec4i64 %a, %shifttypec4i64 %b) {
+entry:
+  ; SSE2: shift4i64c
+  ; SSE2: cost of 2 {{.*}} lshr
+  ; SSE2-CODEGEN: shift4i64c
+  ; SSE2-CODEGEN: psrlq $3
+
+  %0 = lshr %shifttypec4i64 %a , <i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec4i64 %0
+}
+
+%shifttypec8i64 = type <8 x i64>
+define %shifttypec8i64 @shift8i64c(%shifttypec8i64 %a, %shifttypec8i64 %b) {
+entry:
+  ; SSE2: shift8i64c
+  ; SSE2: cost of 4 {{.*}} lshr
+  ; SSE2-CODEGEN: shift8i64c
+  ; SSE2-CODEGEN: psrlq $3
+
+ %0 = lshr %shifttypec8i64 %a , <i64 3, i64 3, i64 3, i64 3,
+                                 i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec8i64 %0
+}
+
+%shifttypec16i64 = type <16 x i64>
+define %shifttypec16i64 @shift16i64c(%shifttypec16i64 %a, %shifttypec16i64 %b) {
+entry:
+  ; SSE2: shift16i64c
+  ; SSE2: cost of 8 {{.*}} lshr
+  ; SSE2-CODEGEN: shift16i64c
+  ; SSE2-CODEGEN: psrlq $3
+
+  %0 = lshr %shifttypec16i64 %a , <i64 3, i64 3, i64 3, i64 3,
+                                   i64 3, i64 3, i64 3, i64 3,
+                                   i64 3, i64 3, i64 3, i64 3,
+                                   i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec16i64 %0
+}
+
+%shifttypec32i64 = type <32 x i64>
+define %shifttypec32i64 @shift32i64c(%shifttypec32i64 %a, %shifttypec32i64 %b) {
+entry:
+  ; SSE2: shift32i64c
+  ; SSE2: cost of 256 {{.*}} lshr
+  ; SSE2-CODEGEN: shift32i64c
+  ; SSE2-CODEGEN: psrlq $3
+
+  %0 = lshr %shifttypec32i64 %a ,<i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec32i64 %0
+}
+
+%shifttypec2i8 = type <2 x i8>
+define %shifttypec2i8 @shift2i8c(%shifttypec2i8 %a, %shifttypec2i8 %b) {
+entry:
+  ; SSE2: shift2i8c
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN: shift2i8c
+  ; SSE2-CODEGEN: psrlq $3
+
+  %0 = lshr %shifttypec2i8 %a , <i8 3, i8 3>
+  ret %shifttypec2i8 %0
+}
+
+%shifttypec4i8 = type <4 x i8>
+define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) {
+entry:
+  ; SSE2: shift4i8c
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN: shift4i8c
+  ; SSE2-CODEGEN: psrld $3
+
+  %0 = lshr %shifttypec4i8 %a , <i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec4i8 %0
+}
+
+%shifttypec8i8 = type <8 x i8>
+define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) {
+entry:
+  ; SSE2: shift8i8c
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN: shift8i8c
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = lshr %shifttypec8i8 %a , <i8 3, i8 3, i8 3, i8 3,
+                                 i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec8i8 %0
+}
+
+%shifttypec16i8 = type <16 x i8>
+define %shifttypec16i8 @shift16i8c(%shifttypec16i8 %a, %shifttypec16i8 %b) {
+entry:
+  ; SSE2: shift16i8c
+  ; SSE2: cost of 1 {{.*}} lshr
+  ; SSE2-CODEGEN: shift16i8c
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = lshr %shifttypec16i8 %a , <i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec16i8 %0
+}
+
+%shifttypec32i8 = type <32 x i8>
+define %shifttypec32i8 @shift32i8c(%shifttypec32i8 %a, %shifttypec32i8 %b) {
+entry:
+  ; SSE2: shift32i8c
+  ; SSE2: cost of 2 {{.*}} lshr
+  ; SSE2-CODEGEN: shift32i8c
+  ; SSE2-CODEGEN: psrlw $3
+
+  %0 = lshr %shifttypec32i8 %a , <i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec32i8 %0
+}
diff --git a/test/Analysis/CostModel/X86/testshiftshl.ll b/test/Analysis/CostModel/X86/testshiftshl.ll
index 897d9832e5..f45a698792 100644
--- a/test/Analysis/CostModel/X86/testshiftshl.ll
+++ b/test/Analysis/CostModel/X86/testshiftshl.ll
@@ -240,3 +240,291 @@ entry:
   %0 = shl %shifttype32i8 %a , %b
   ret %shifttype32i8 %0
 }
+
+; Test shift by a constant vector.
+
+%shifttypec = type <2 x i16>
+define %shifttypec @shift2i16const(%shifttypec %a, %shifttypec %b) {
+entry:
+  ; SSE2: shift2i16const
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN: shift2i16const
+  ; SSE2-CODEGEN: psllq $3
+
+  %0 = shl %shifttypec %a , <i16 3, i16 3>
+  ret %shifttypec %0
+}
+
+%shifttypec4i16 = type <4 x i16>
+define %shifttypec4i16 @shift4i16const(%shifttypec4i16 %a, %shifttypec4i16 %b) {
+entry:
+  ; SSE2: shift4i16const
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN: shift4i16const
+  ; SSE2-CODEGEN: pslld $3
+
+  %0 = shl %shifttypec4i16 %a , <i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec4i16 %0
+}
+
+%shifttypec8i16 = type <8 x i16>
+define %shifttypec8i16 @shift8i16const(%shifttypec8i16 %a, %shifttypec8i16 %b) {
+entry:
+  ; SSE2: shift8i16const
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN: shift8i16const
+  ; SSE2-CODEGEN: psllw $3
+
+  %0 = shl %shifttypec8i16 %a , <i16 3, i16 3, i16 3, i16 3,
+                                  i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec8i16 %0
+}
+
+%shifttypec16i16 = type <16 x i16>
+define %shifttypec16i16 @shift16i16const(%shifttypec16i16 %a,
+                                         %shifttypec16i16 %b) {
+entry:
+  ; SSE2: shift16i16const
+  ; SSE2: cost of 2 {{.*}} shl
+  ; SSE2-CODEGEN: shift16i16const
+  ; SSE2-CODEGEN: psllw $3
+
+  %0 = shl %shifttypec16i16 %a , <i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec16i16 %0
+}
+
+%shifttypec32i16 = type <32 x i16>
+define %shifttypec32i16 @shift32i16const(%shifttypec32i16 %a,
+                                        %shifttypec32i16 %b) {
+entry:
+  ; SSE2: shift32i16const
+  ; SSE2: cost of 4 {{.*}} shl
+  ; SSE2-CODEGEN: shift32i16const
+  ; SSE2-CODEGEN: psllw $3
+
+  %0 = shl %shifttypec32i16 %a , <i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3,
+                                   i16 3, i16 3, i16 3, i16 3>
+  ret %shifttypec32i16 %0
+}
+
+%shifttypec2i32 = type <2 x i32>
+define %shifttypec2i32 @shift2i32c(%shifttypec2i32 %a, %shifttypec2i32 %b) {
+entry:
+  ; SSE2: shift2i32c
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN: shift2i32c
+  ; SSE2-CODEGEN: psllq $3
+
+  %0 = shl %shifttypec2i32 %a , <i32 3, i32 3>
+  ret %shifttypec2i32 %0
+}
+
+%shifttypec4i32 = type <4 x i32>
+define %shifttypec4i32 @shift4i32c(%shifttypec4i32 %a, %shifttypec4i32 %b) {
+entry:
+  ; SSE2: shift4i32c
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN: shift4i32c
+  ; SSE2-CODEGEN: pslld $3
+
+  %0 = shl %shifttypec4i32 %a , <i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec4i32 %0
+}
+
+%shifttypec8i32 = type <8 x i32>
+define %shifttypec8i32 @shift8i32c(%shifttypec8i32 %a, %shifttypec8i32 %b) {
+entry:
+  ; SSE2: shift8i32c
+  ; SSE2: cost of 2 {{.*}} shl
+  ; SSE2-CODEGEN: shift8i32c
+  ; SSE2-CODEGEN: pslld $3
+
+  %0 = shl %shifttypec8i32 %a , <i32 3, i32 3, i32 3, i32 3,
+                                  i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec8i32 %0
+}
+
+%shifttypec16i32 = type <16 x i32>
+define %shifttypec16i32 @shift16i32c(%shifttypec16i32 %a, %shifttypec16i32 %b) {
+entry:
+  ; SSE2: shift16i32c
+  ; SSE2: cost of 4 {{.*}} shl
+  ; SSE2-CODEGEN: shift16i32c
+  ; SSE2-CODEGEN: pslld $3
+
+  %0 = shl %shifttypec16i32 %a , <i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec16i32 %0
+}
+
+%shifttypec32i32 = type <32 x i32>
+define %shifttypec32i32 @shift32i32c(%shifttypec32i32 %a, %shifttypec32i32 %b) {
+entry:
+  ; SSE2: shift32i32c
+  ; getTypeConversion fails here and promotes this to an i64.
+  ; SSE2: cost of 256 {{.*}} shl
+  ; SSE2-CODEGEN: shift32i32c
+  ; SSE2-CODEGEN: pslld $3
+  %0 = shl %shifttypec32i32 %a , <i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3,
+                                   i32 3, i32 3, i32 3, i32 3>
+  ret %shifttypec32i32 %0
+}
+
+%shifttypec2i64 = type <2 x i64>
+define %shifttypec2i64 @shift2i64c(%shifttypec2i64 %a, %shifttypec2i64 %b) {
+entry:
+  ; SSE2: shift2i64c
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN: shift2i64c
+  ; SSE2-CODEGEN: psllq $3
+
+  %0 = shl %shifttypec2i64 %a , <i64 3, i64 3>
+  ret %shifttypec2i64 %0
+}
+
+%shifttypec4i64 = type <4 x i64>
+define %shifttypec4i64 @shift4i64c(%shifttypec4i64 %a, %shifttypec4i64 %b) {
+entry:
+  ; SSE2: shift4i64c
+  ; SSE2: cost of 2 {{.*}} shl
+  ; SSE2-CODEGEN: shift4i64c
+  ; SSE2-CODEGEN: psllq $3
+
+  %0 = shl %shifttypec4i64 %a , <i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec4i64 %0
+}
+
+%shifttypec8i64 = type <8 x i64>
+define %shifttypec8i64 @shift8i64c(%shifttypec8i64 %a, %shifttypec8i64 %b) {
+entry:
+  ; SSE2: shift8i64c
+  ; SSE2: cost of 4 {{.*}} shl
+  ; SSE2-CODEGEN: shift8i64c
+  ; SSE2-CODEGEN: psllq $3
+
+ %0 = shl %shifttypec8i64 %a , <i64 3, i64 3, i64 3, i64 3,
+                                 i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec8i64 %0
+}
+
+%shifttypec16i64 = type <16 x i64>
+define %shifttypec16i64 @shift16i64c(%shifttypec16i64 %a, %shifttypec16i64 %b) {
+entry:
+  ; SSE2: shift16i64c
+  ; SSE2: cost of 8 {{.*}} shl
+  ; SSE2-CODEGEN: shift16i64c
+  ; SSE2-CODEGEN: psllq $3
+
+  %0 = shl %shifttypec16i64 %a , <i64 3, i64 3, i64 3, i64 3,
+                                   i64 3, i64 3, i64 3, i64 3,
+                                   i64 3, i64 3, i64 3, i64 3,
+                                   i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec16i64 %0
+}
+
+%shifttypec32i64 = type <32 x i64>
+define %shifttypec32i64 @shift32i64c(%shifttypec32i64 %a, %shifttypec32i64 %b) {
+entry:
+  ; SSE2: shift32i64c
+  ; SSE2: cost of 256 {{.*}} shl
+  ; SSE2-CODEGEN: shift32i64c
+  ; SSE2-CODEGEN: psllq $3
+
+  %0 = shl %shifttypec32i64 %a ,<i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3,
+                                  i64 3, i64 3, i64 3, i64 3>
+  ret %shifttypec32i64 %0
+}
+
+%shifttypec2i8 = type <2 x i8>
+define %shifttypec2i8 @shift2i8c(%shifttypec2i8 %a, %shifttypec2i8 %b) {
+entry:
+  ; SSE2: shift2i8c
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN: shift2i8c
+  ; SSE2-CODEGEN: psllq $3
+
+  %0 = shl %shifttypec2i8 %a , <i8 3, i8 3>
+  ret %shifttypec2i8 %0
+}
+
+%shifttypec4i8 = type <4 x i8>
+define %shifttypec4i8 @shift4i8c(%shifttypec4i8 %a, %shifttypec4i8 %b) {
+entry:
+  ; SSE2: shift4i8c
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN: shift4i8c
+  ; SSE2-CODEGEN: pslld $3
+
+  %0 = shl %shifttypec4i8 %a , <i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec4i8 %0
+}
+
+%shifttypec8i8 = type <8 x i8>
+define %shifttypec8i8 @shift8i8c(%shifttypec8i8 %a, %shifttypec8i8 %b) {
+entry:
+  ; SSE2: shift8i8c
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN: shift8i8c
+  ; SSE2-CODEGEN: psllw $3
+
+  %0 = shl %shifttypec8i8 %a , <i8 3, i8 3, i8 3, i8 3,
+                                 i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec8i8 %0
+}
+
+%shifttypec16i8 = type <16 x i8>
+define %shifttypec16i8 @shift16i8c(%shifttypec16i8 %a, %shifttypec16i8 %b) {
+entry:
+  ; SSE2: shift16i8c
+  ; SSE2: cost of 1 {{.*}} shl
+  ; SSE2-CODEGEN: shift16i8c
+  ; SSE2-CODEGEN: psllw $3
+
+  %0 = shl %shifttypec16i8 %a , <i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec16i8 %0
+}
+
+%shifttypec32i8 = type <32 x i8>
+define %shifttypec32i8 @shift32i8c(%shifttypec32i8 %a, %shifttypec32i8 %b) {
+entry:
+  ; SSE2: shift32i8c
+  ; SSE2: cost of 2 {{.*}} shl
+  ; SSE2-CODEGEN: shift32i8c
+  ; SSE2-CODEGEN: psllw $3
+
+  %0 = shl %shifttypec32i8 %a , <i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3,
+                                  i8 3, i8 3, i8 3, i8 3>
+  ret %shifttypec32i8 %0
+}
author	Arnold Schwaighofer <aschwaighofer@apple.com>	2013-04-04 23:26:24 +0000
committer	Arnold Schwaighofer <aschwaighofer@apple.com>	2013-04-04 23:26:24 +0000
commit	2537f3c6597bc1b8eb14c76c8f8e7046be41c9ba (patch)
tree	781143f2b27f08fe01dcfe79e732057fc6847445 /test
parent	6bf4f676413b8f7d97aaff289997aab344180957 (diff)
download	llvm-2537f3c6597bc1b8eb14c76c8f8e7046be41c9ba.tar.gz llvm-2537f3c6597bc1b8eb14c76c8f8e7046be41c9ba.tar.bz2 llvm-2537f3c6597bc1b8eb14c76c8f8e7046be41c9ba.tar.xz