From eee7a7a8362afddab3fd9bf10b7023da7e7c42e5 Mon Sep 17 00:00:00 2001
From: Tim Northover <tnorthover@apple.com>
Date: Fri, 13 Jun 2014 17:29:39 +0000
Subject: X86: lower ATOMIC_CMP_SWAP_WITH_SUCCESS directly

Lowering this new node allows us to fold the almost universal
comparison for success before it's even formed. Instead we can create
a copy from EFLAGS and an X86ISD::SETCC operation since all "cmpxchg"
instructions set the zero-flag to the correct value.

rdar://problem/13201607

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@210923 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp  | 43 +++++++++++++-----
 test/CodeGen/X86/cmpxchg-i1.ll      | 87 +++++++++++++++++++++++++++++++++++++
 test/CodeGen/X86/cmpxchg-i128-i1.ll | 83 +++++++++++++++++++++++++++++++++++
 3 files changed, 201 insertions(+), 12 deletions(-)
 create mode 100644 test/CodeGen/X86/cmpxchg-i1.ll
 create mode 100644 test/CodeGen/X86/cmpxchg-i128-i1.ll
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 38a0f06bc4..5e652e3182 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -580,7 +580,7 @@ void X86TargetLowering::resetOperationActions() {
   // Expand certain atomics
   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
     MVT VT = IntVTs[i];
-    setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom);
+    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
   }
@@ -601,7 +601,7 @@ void X86TargetLowering::resetOperationActions() {
   }
 
   if (Subtarget->hasCmpxchg16b()) {
-    setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
+    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
   }
 
   // FIXME - use subtarget debug flags
@@ -14529,7 +14529,7 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
     break;
   }
   SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
-                                    Op.getOperand(2), SDValue());
+                                  Op.getOperand(2), SDValue());
   SDValue Ops[] = { cpIn.getValue(0),
                     Op.getOperand(1),
                     Op.getOperand(3),
@@ -14539,9 +14539,18 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
                                            Ops, T, MMO);
+
   SDValue cpOut =
     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
-  return cpOut;
+  SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
+                                      MVT::i32, cpOut.getValue(2));
+  SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1),
+                                DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
+
+  DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
+  DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
+  DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
+  return SDValue();
 }
 
 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
@@ -14721,7 +14730,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   default: llvm_unreachable("Should not custom lower this!");
   case ISD::SIGN_EXTEND_INREG:  return LowerSIGN_EXTEND_INREG(Op,DAG);
   case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
-  case ISD::ATOMIC_CMP_SWAP:    return LowerCMP_SWAP(Op, Subtarget, DAG);
+  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
+    return LowerCMP_SWAP(Op, Subtarget, DAG);
   case ISD::ATOMIC_LOAD_SUB:    return LowerLOAD_SUB(Op,DAG);
   case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op,DAG);
   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
@@ -14803,8 +14813,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
 }
 
 static void ReplaceATOMIC_LOAD(SDNode *Node,
-                                  SmallVectorImpl<SDValue> &Results,
-                                  SelectionDAG &DAG) {
+                               SmallVectorImpl<SDValue> &Results,
+                               SelectionDAG &DAG) {
   SDLoc dl(Node);
   EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
 
@@ -14813,16 +14823,16 @@ static void ReplaceATOMIC_LOAD(SDNode *Node,
   //        (The only way to get a 16-byte load is cmpxchg16b)
   // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment.
   SDValue Zero = DAG.getConstant(0, VT);
-  SDVTList VTs = DAG.getVTList(VT, MVT::Other);
+  SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other);
   SDValue Swap =
-      DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP, dl, VT, VTs,
+      DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, VT, VTs,
                            Node->getOperand(0), Node->getOperand(1), Zero, Zero,
                            cast<AtomicSDNode>(Node)->getMemOperand(),
                            cast<AtomicSDNode>(Node)->getOrdering(),
                            cast<AtomicSDNode>(Node)->getOrdering(),
                            cast<AtomicSDNode>(Node)->getSynchScope());
   Results.push_back(Swap.getValue(0));
-  Results.push_back(Swap.getValue(1));
+  Results.push_back(Swap.getValue(2));
 }
 
 static void
@@ -14938,7 +14948,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
                                    Results);
   }
-  case ISD::ATOMIC_CMP_SWAP: {
+  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
     EVT T = N->getValueType(0);
     assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
     bool Regs64bit = T == MVT::i128;
@@ -14980,8 +14990,17 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                         Regs64bit ? X86::RDX : X86::EDX,
                                         HalfT, cpOutL.getValue(2));
     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
+
+    SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
+                                        MVT::i32, cpOutH.getValue(2));
+    SDValue Success =
+        DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+                    DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS);
+    Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
+
     Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
-    Results.push_back(cpOutH.getValue(1));
+    Results.push_back(Success);
+    Results.push_back(EFLAGS.getValue(1));
     return;
   }
   case ISD::ATOMIC_LOAD_ADD:
diff --git a/test/CodeGen/X86/cmpxchg-i1.ll b/test/CodeGen/X86/cmpxchg-i1.ll
new file mode 100644
index 0000000000..a21ab593b0
--- /dev/null
+++ b/test/CodeGen/X86/cmpxchg-i1.ll
@@ -0,0 +1,87 @@
+; RUN: llc -mtriple=x86_64 -o - %s | FileCheck %s
+
+define i1 @try_cmpxchg(i32* %addr, i32 %desired, i32 %new) {
+; CHECK-LABEL: try_cmpxchg:
+; CHECK: cmpxchgl
+; CHECK-NOT: cmp
+; CHECK: sete %al
+; CHECK: retq
+  %pair = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
+  %success = extractvalue { i32, i1 } %pair, 1
+  ret i1 %success
+}
+
+define void @cmpxchg_flow(i64* %addr, i64 %desired, i64 %new) {
+; CHECK-LABEL: cmpxchg_flow:
+; CHECK: cmpxchgq
+; CHECK-NOT: cmp
+; CHECK-NOT: set
+; CHECK: {{jne|jeq}}
+  %pair = cmpxchg i64* %addr, i64 %desired, i64 %new seq_cst seq_cst
+  %success = extractvalue { i64, i1 } %pair, 1
+  br i1 %success, label %true, label %false
+
+true:
+  call void @foo()
+  ret void
+
+false:
+  call void @bar()
+  ret void
+}
+
+define i64 @cmpxchg_sext(i32* %addr, i32 %desired, i32 %new) {
+; CHECK-LABEL: cmpxchg_sext:
+; CHECK-DAG: cmpxchgl
+; CHECK-NOT: cmpl
+; CHECK: sete %al
+; CHECK: retq
+  %pair = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
+  %success = extractvalue { i32, i1 } %pair, 1
+  %mask = sext i1 %success to i64
+  ret i64 %mask
+}
+
+define i32 @cmpxchg_zext(i32* %addr, i32 %desired, i32 %new) {
+; CHECK-LABEL: cmpxchg_zext:
+; CHECK: cmpxchgl
+; CHECK-NOT: cmp
+; CHECK: sete [[BYTE:%[a-z0-9]+]]
+; CHECK: movzbl [[BYTE]], %eax
+  %pair = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst
+  %success = extractvalue { i32, i1 } %pair, 1
+  %mask = zext i1 %success to i32
+  ret i32 %mask
+}
+
+
+define i32 @cmpxchg_use_eflags_and_val(i32* %addr, i32 %offset) {
+; CHECK-LABEL: cmpxchg_use_eflags_and_val:
+; CHECK: movl (%rdi), %e[[OLDVAL:[a-z0-9]+]]
+
+; CHECK: [[LOOPBB:.?LBB[0-9]+_[0-9]+]]:
+; CHECK: leal (%r[[OLDVAL]],%rsi), [[NEW:%[a-z0-9]+]]
+; CHECK: cmpxchgl [[NEW]], (%rdi)
+; CHECK-NOT: cmpl
+; CHECK: jne [[LOOPBB]]
+
+  ; Result already in %eax
+; CHECK: retq
+entry:
+  %init = load atomic i32* %addr seq_cst, align 4
+  br label %loop
+
+loop:
+  %old = phi i32 [%init, %entry], [%oldval, %loop]
+  %new = add i32 %old, %offset
+  %pair = cmpxchg i32* %addr, i32 %old, i32 %new seq_cst seq_cst
+  %oldval = extractvalue { i32, i1 } %pair, 0
+  %success = extractvalue { i32, i1 } %pair, 1
+  br i1 %success, label %done, label %loop
+
+done:
+  ret i32 %oldval
+}
+
+declare void @foo()
+declare void @bar()
diff --git a/test/CodeGen/X86/cmpxchg-i128-i1.ll b/test/CodeGen/X86/cmpxchg-i128-i1.ll
new file mode 100644
index 0000000000..4dd30013ec
--- /dev/null
+++ b/test/CodeGen/X86/cmpxchg-i128-i1.ll
@@ -0,0 +1,83 @@
+; RUN: llc -mcpu=core-avx2 -mtriple=x86_64 -o - %s | FileCheck %s
+
+define i1 @try_cmpxchg(i128* %addr, i128 %desired, i128 %new) {
+; CHECK-LABEL: try_cmpxchg:
+; CHECK: cmpxchg16b
+; CHECK-NOT: cmp
+; CHECK: sete %al
+; CHECK: retq
+  %pair = cmpxchg i128* %addr, i128 %desired, i128 %new seq_cst seq_cst
+  %success = extractvalue { i128, i1 } %pair, 1
+  ret i1 %success
+}
+
+define void @cmpxchg_flow(i128* %addr, i128 %desired, i128 %new) {
+; CHECK-LABEL: cmpxchg_flow:
+; CHECK: cmpxchg16b
+; CHECK-NOT: cmp
+; CHECK-NOT: set
+; CHECK: {{jne|jeq}}
+  %pair = cmpxchg i128* %addr, i128 %desired, i128 %new seq_cst seq_cst
+  %success = extractvalue { i128, i1 } %pair, 1
+  br i1 %success, label %true, label %false
+
+true:
+  call void @foo()
+  ret void
+
+false:
+  call void @bar()
+  ret void
+}
+
+; Can't use the flags here because cmpxchg16b only sets ZF.
+define i1 @cmpxchg_arithcmp(i128* %addr, i128 %desired, i128 %new) {
+; CHECK-LABEL: cmpxchg_arithcmp:
+; CHECK: cmpxchg16b
+; CHECK: cmpq
+; CHECK: retq
+  %pair = cmpxchg i128* %addr, i128 %desired, i128 %new seq_cst seq_cst
+  %oldval = extractvalue { i128, i1 } %pair, 0
+  %success = icmp sge i128 %oldval, %desired
+  ret i1 %success
+}
+
+define i128 @cmpxchg_zext(i128* %addr, i128 %desired, i128 %new) {
+; CHECK-LABEL: cmpxchg_zext:
+; CHECK: cmpxchg16b
+; CHECK-NOT: cmpq
+; CHECK: sete [[BYTE:%[a-z0-9]+]]
+; CHECK: movzbl [[BYTE]], %eax
+  %pair = cmpxchg i128* %addr, i128 %desired, i128 %new seq_cst seq_cst
+  %success = extractvalue { i128, i1 } %pair, 1
+  %mask = zext i1 %success to i128
+  ret i128 %mask
+}
+
+
+define i128 @cmpxchg_use_eflags_and_val(i128* %addr, i128 %offset) {
+; CHECK-LABEL: cmpxchg_use_eflags_and_val:
+
+; CHECK: cmpxchg16b
+; CHECK-NOT: cmpq
+; CHECK: jne
+entry:
+  %init = load atomic i128* %addr seq_cst, align 16
+  br label %loop
+
+loop:
+  %old = phi i128 [%init, %entry], [%oldval, %loop]
+  %new = add i128 %old, %offset
+
+  %pair = cmpxchg i128* %addr, i128 %old, i128 %new seq_cst seq_cst
+  %oldval = extractvalue { i128, i1 } %pair, 0
+  %success = extractvalue { i128, i1 } %pair, 1
+
+  br i1 %success, label %done, label %loop
+
+done:
+  ret i128 %old
+}
+
+declare void @foo()
+declare void @bar()
-- 
cgit v1.2.3