From d16c8d0d336638225378466bc17c9db156401817 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nadav.rotem@intel.com>
Date: Sat, 7 Apr 2012 21:19:08 +0000
Subject: 1. Remove the part of r153848 which optimizes shuffle-of-shuffle into
 a new    shuffle node because it could introduce new shuffle nodes that were
 not    supported efficiently by the target.

2. Add a more restrictive shuffle-of-shuffle optimization for cases where the
   second shuffle reverses the transformation of the first shuffle.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@154266 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 26 ++++++++++++++------------
 test/CodeGen/CellSPU/rotate_ops.ll       |  2 +-
 test/CodeGen/X86/2011-10-27-tstore.ll    | 10 +++++-----
 test/CodeGen/X86/SwizzleShuff.ll         | 25 +++++++++++++++++++++++++
 test/CodeGen/X86/vec_compare-2.ll        |  6 ++++--
 test/CodeGen/X86/vec_shuffle-37.ll       | 10 +++++-----
 test/CodeGen/X86/widen_shuffle-1.ll      |  2 +-
 7 files changed, 55 insertions(+), 26 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index e0fd3abfb4..ebffb8562e 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7795,19 +7795,20 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
   }
 
   // If this shuffle node is simply a swizzle of another shuffle node,
-  // optimize shuffle(shuffle(x, y), undef) -> shuffle(x, y).
+  // and it reverses the swizzle of the previous shuffle then we can
+  // optimize shuffle(shuffle(x, undef), undef) -> x.
   if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG &&
       N1.getOpcode() == ISD::UNDEF) {
 
-    SmallVector<int, 8> NewMask;
     ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0);
 
-    // If the source shuffle has more than one user then do not try to optimize
-    // it because it may generate a more complex shuffle node. However, if the
-    // source shuffle is also a swizzle (a single source shuffle), our
-    // transformation is still likely to reduce the number of shuffles and only
-    // generate a simple shuffle node.
-    if (N0.getOperand(1).getOpcode() != ISD::UNDEF && !N0.hasOneUse())
+    // Shuffle nodes can only reverse shuffles with a single non-undef value.
+    if (N0.getOperand(1).getOpcode() != ISD::UNDEF)
+      return SDValue();
+
+    // The incoming shuffle must be of the same type as the result of the current
+    // shuffle.
+    if (OtherSV->getOperand(0).getValueType() != VT)
       return SDValue();
 
     EVT InVT = N0.getValueType();
@@ -7824,11 +7825,12 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
       if (Idx >= 0)
         Idx = OtherSV->getMaskElt(Idx);
 
-      NewMask.push_back(Idx);
+      // The combined shuffle must map each index to itself.
+      if (Idx != i && Idx != -1)
+        return SDValue();
     }
-    assert(NewMask.size() == VT.getVectorNumElements() && "Invalid mask size");
-    return DAG.getVectorShuffle(VT, N->getDebugLoc(), OtherSV->getOperand(0),
-                                OtherSV->getOperand(1), &NewMask[0]);
+
+    return OtherSV->getOperand(0);
   }
 
   return SDValue();
diff --git a/test/CodeGen/CellSPU/rotate_ops.ll b/test/CodeGen/CellSPU/rotate_ops.ll
index 8b7af20b4a..9770935276 100644
--- a/test/CodeGen/CellSPU/rotate_ops.ll
+++ b/test/CodeGen/CellSPU/rotate_ops.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=cellspu -o %t1.s
-; RUN: grep rot          %t1.s | count 85
+; RUN: grep rot          %t1.s | count 86
 ; RUN: grep roth         %t1.s | count 8
 ; RUN: grep roti.*5      %t1.s | count 1
 ; RUN: grep roti.*27     %t1.s | count 1
diff --git a/test/CodeGen/X86/2011-10-27-tstore.ll b/test/CodeGen/X86/2011-10-27-tstore.ll
index 1712f34565..6e83f6713a 100644
--- a/test/CodeGen/X86/2011-10-27-tstore.ll
+++ b/test/CodeGen/X86/2011-10-27-tstore.ll
@@ -4,13 +4,13 @@ target triple = "x86_64-unknown-linux-gnu"
 
 ;CHECK: ltstore
 ;CHECK: movq
-;CHECK-NEXT: movq
-;CHECK-NEXT: ret
-define void @ltstore(<4 x i32>* %pIn, <2 x i32>* %pOut) {
+;CHECK: movq
+;CHECK: ret
+define void @ltstore(<4 x i32>* %pA, <2 x i32>* %pB) {
 entry:
-  %in = load <4 x i32>* %pIn
+  %in = load <4 x i32>* %pA
   %j = shufflevector <4 x i32> %in, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
-  store <2 x i32> %j, <2 x i32>* %pOut
+  store <2 x i32> %j, <2 x i32>* %pB
   ret void
 }
 
diff --git a/test/CodeGen/X86/SwizzleShuff.ll b/test/CodeGen/X86/SwizzleShuff.ll
index c60d9b899c..100817a676 100644
--- a/test/CodeGen/X86/SwizzleShuff.ll
+++ b/test/CodeGen/X86/SwizzleShuff.ll
@@ -41,3 +41,28 @@ define <4 x i8> @pull_bitcast2 (<4 x i8>* %pA, <4 x i8>* %pB, <4 x i8>* %pC) {
   store <4 x i8> %C, <4 x i8>* %pA
   ret <4 x i8> %C
 }
+
+
+
+; CHECK: reverse_1
+; CHECK-NOT: shuf
+; CHECK: ret
+define <4 x i32> @reverse_1 (<4 x i32>* %pA, <4 x i32>* %pB) {
+  %A = load <4 x i32>* %pA
+  %B = load <4 x i32>* %pB
+  %S = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  %S1 = shufflevector <4 x i32> %S, <4 x i32> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  ret <4 x i32> %S1
+}
+
+
+; CHECK: no_reverse_shuff
+; CHECK: shuf
+; CHECK: ret
+define <4 x i32> @no_reverse_shuff (<4 x i32>* %pA, <4 x i32>* %pB) {
+  %A = load <4 x i32>* %pA
+  %B = load <4 x i32>* %pB
+  %S = shufflevector <4 x i32> %A, <4 x i32> %B, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
+  %S1 = shufflevector <4 x i32> %S, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 3, i32 2>
+  ret <4 x i32> %S1
+}
diff --git a/test/CodeGen/X86/vec_compare-2.ll b/test/CodeGen/X86/vec_compare-2.ll
index 946b126fd0..91777f7aa6 100644
--- a/test/CodeGen/X86/vec_compare-2.ll
+++ b/test/CodeGen/X86/vec_compare-2.ll
@@ -10,8 +10,10 @@ define void @blackDespeckle_wrapper(i8** %args_list, i64* %gtid, i64 %xend) {
 entry:
 ; CHECK: cfi_def_cfa_offset
 ; CHECK-NOT: set
-; CHECK: pcmpgt
-; CHECK: blendvps
+; CHECK: movzwl
+; CHECK: movzwl
+; CHECK: pshufd
+; CHECK: pshufb
   %shr.i = ashr <4 x i32> zeroinitializer, <i32 3, i32 3, i32 3, i32 3> ; <<4 x i32>> [#uses=1]
   %cmp318.i = sext <4 x i1> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=1]
   %sub322.i = sub <4 x i32> %shr.i, zeroinitializer ; <<4 x i32>> [#uses=1]
diff --git a/test/CodeGen/X86/vec_shuffle-37.ll b/test/CodeGen/X86/vec_shuffle-37.ll
index 619652aff1..430aa046af 100644
--- a/test/CodeGen/X86/vec_shuffle-37.ll
+++ b/test/CodeGen/X86/vec_shuffle-37.ll
@@ -27,11 +27,11 @@ entry:
 define void @t02(<8 x i32>* %source, <2 x i32>* %dest) nounwind noinline {
 entry:
 ; CHECK: t02
-; CHECK: mov
-; CHECK-NEXT: mov
-; CHECK-NEXT: mov
-; CHECK-NEXT: mov
-; CHECK-NEXT: ret
+; CHECK: movaps
+; CHECK: shufps
+; CHECK: pshufd
+; CHECK: movq
+; CHECK: ret
   %0 = bitcast <8 x i32>* %source to <4 x i32>*
   %arrayidx = getelementptr inbounds <4 x i32>* %0, i64 3
   %tmp2 = load <4 x i32>* %arrayidx, align 16
diff --git a/test/CodeGen/X86/widen_shuffle-1.ll b/test/CodeGen/X86/widen_shuffle-1.ll
index 9420053716..7bebb274f6 100644
--- a/test/CodeGen/X86/widen_shuffle-1.ll
+++ b/test/CodeGen/X86/widen_shuffle-1.ll
@@ -33,7 +33,7 @@ entry:
 define void @shuf3(<4 x float> %tmp10, <4 x float> %vecinit15, <4 x float>* %dst) nounwind {
 entry:
 ; CHECK: shuf3:
-; CHECK: shufd
+; CHECK: shufps
   %shuffle.i.i.i12 = shufflevector <4 x float> %tmp10, <4 x float> %vecinit15, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
   %tmp25.i.i = shufflevector <4 x float> %shuffle.i.i.i12, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2> 
   %tmp1.i.i = shufflevector <3 x float> %tmp25.i.i, <3 x float> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-- 
cgit v1.2.3