From eee7a7a8362afddab3fd9bf10b7023da7e7c42e5 Mon Sep 17 00:00:00 2001 From: Tim Northover Date: Fri, 13 Jun 2014 17:29:39 +0000 Subject: X86: lower ATOMIC_CMP_SWAP_WITH_SUCCESS directly Lowering this new node allows us to fold the almost universal comparison for success before it's even formed. Instead we can create a copy from EFLAGS and an X86ISD::SETCC operation since all "cmpxchg" instructions set the zero-flag to the correct value. rdar://problem/13201607 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@210923 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 43 +++++++++++++----- test/CodeGen/X86/cmpxchg-i1.ll | 87 +++++++++++++++++++++++++++++++++++++ test/CodeGen/X86/cmpxchg-i128-i1.ll | 83 +++++++++++++++++++++++++++++++++++ 3 files changed, 201 insertions(+), 12 deletions(-) create mode 100644 test/CodeGen/X86/cmpxchg-i1.ll create mode 100644 test/CodeGen/X86/cmpxchg-i128-i1.ll diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 38a0f06bc4..5e652e3182 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -580,7 +580,7 @@ void X86TargetLowering::resetOperationActions() { // Expand certain atomics for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { MVT VT = IntVTs[i]; - setOperationAction(ISD::ATOMIC_CMP_SWAP, VT, Custom); + setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); setOperationAction(ISD::ATOMIC_STORE, VT, Custom); } @@ -601,7 +601,7 @@ void X86TargetLowering::resetOperationActions() { } if (Subtarget->hasCmpxchg16b()) { - setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom); + setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); } // FIXME - use subtarget debug flags @@ -14529,7 +14529,7 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, break; } SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, - Op.getOperand(2), SDValue()); + Op.getOperand(2), SDValue()); SDValue Ops[] = { cpIn.getValue(0), Op.getOperand(1), Op.getOperand(3), @@ -14539,9 +14539,18 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget, MachineMemOperand *MMO = cast(Op)->getMemOperand(); SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, Ops, T, MMO); + SDValue cpOut = DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); - return cpOut; + SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS, + MVT::i32, cpOut.getValue(2)); + SDValue Success = DAG.getNode(X86ISD::SETCC, DL, Op->getValueType(1), + DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS); + + DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut); + DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success); + DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1)); + return SDValue(); } static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget, @@ -14721,7 +14730,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { default: llvm_unreachable("Should not custom lower this!"); case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op,DAG); case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); - case ISD::ATOMIC_CMP_SWAP: return LowerCMP_SWAP(Op, Subtarget, DAG); + case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: + return LowerCMP_SWAP(Op, Subtarget, DAG); case ISD::ATOMIC_LOAD_SUB: return LowerLOAD_SUB(Op,DAG); case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op,DAG); case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); @@ -14803,8 +14813,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { } static void ReplaceATOMIC_LOAD(SDNode *Node, - SmallVectorImpl &Results, - SelectionDAG &DAG) { + SmallVectorImpl &Results, + SelectionDAG &DAG) { SDLoc dl(Node); EVT VT = cast(Node)->getMemoryVT(); @@ -14813,16 +14823,16 @@ static void ReplaceATOMIC_LOAD(SDNode *Node, // (The only way to get a 16-byte load is cmpxchg16b) // FIXME: 16-byte ATOMIC_CMP_SWAP isn't actually hooked up at the moment. SDValue Zero = DAG.getConstant(0, VT); - SDVTList VTs = DAG.getVTList(VT, MVT::Other); + SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other); SDValue Swap = - DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP, dl, VT, VTs, + DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, VT, VTs, Node->getOperand(0), Node->getOperand(1), Zero, Zero, cast(Node)->getMemOperand(), cast(Node)->getOrdering(), cast(Node)->getOrdering(), cast(Node)->getSynchScope()); Results.push_back(Swap.getValue(0)); - Results.push_back(Swap.getValue(1)); + Results.push_back(Swap.getValue(2)); } static void @@ -14938,7 +14948,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget, Results); } - case ISD::ATOMIC_CMP_SWAP: { + case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { EVT T = N->getValueType(0); assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); bool Regs64bit = T == MVT::i128; @@ -14980,8 +14990,17 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, Regs64bit ? X86::RDX : X86::EDX, HalfT, cpOutL.getValue(2)); SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; + + SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS, + MVT::i32, cpOutH.getValue(2)); + SDValue Success = + DAG.getNode(X86ISD::SETCC, dl, MVT::i8, + DAG.getConstant(X86::COND_E, MVT::i8), EFLAGS); + Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1)); + Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF)); - Results.push_back(cpOutH.getValue(1)); + Results.push_back(Success); + Results.push_back(EFLAGS.getValue(1)); return; } case ISD::ATOMIC_LOAD_ADD: diff --git a/test/CodeGen/X86/cmpxchg-i1.ll b/test/CodeGen/X86/cmpxchg-i1.ll new file mode 100644 index 0000000000..a21ab593b0 --- /dev/null +++ b/test/CodeGen/X86/cmpxchg-i1.ll @@ -0,0 +1,87 @@ +; RUN: llc -mtriple=x86_64 -o - %s | FileCheck %s + +define i1 @try_cmpxchg(i32* %addr, i32 %desired, i32 %new) { +; CHECK-LABEL: try_cmpxchg: +; CHECK: cmpxchgl +; CHECK-NOT: cmp +; CHECK: sete %al +; CHECK: retq + %pair = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst + %success = extractvalue { i32, i1 } %pair, 1 + ret i1 %success +} + +define void @cmpxchg_flow(i64* %addr, i64 %desired, i64 %new) { +; CHECK-LABEL: cmpxchg_flow: +; CHECK: cmpxchgq +; CHECK-NOT: cmp +; CHECK-NOT: set +; CHECK: {{jne|jeq}} + %pair = cmpxchg i64* %addr, i64 %desired, i64 %new seq_cst seq_cst + %success = extractvalue { i64, i1 } %pair, 1 + br i1 %success, label %true, label %false + +true: + call void @foo() + ret void + +false: + call void @bar() + ret void +} + +define i64 @cmpxchg_sext(i32* %addr, i32 %desired, i32 %new) { +; CHECK-LABEL: cmpxchg_sext: +; CHECK-DAG: cmpxchgl +; CHECK-NOT: cmpl +; CHECK: sete %al +; CHECK: retq + %pair = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst + %success = extractvalue { i32, i1 } %pair, 1 + %mask = sext i1 %success to i64 + ret i64 %mask +} + +define i32 @cmpxchg_zext(i32* %addr, i32 %desired, i32 %new) { +; CHECK-LABEL: cmpxchg_zext: +; CHECK: cmpxchgl +; CHECK-NOT: cmp +; CHECK: sete [[BYTE:%[a-z0-9]+]] +; CHECK: movzbl [[BYTE]], %eax + %pair = cmpxchg i32* %addr, i32 %desired, i32 %new seq_cst seq_cst + %success = extractvalue { i32, i1 } %pair, 1 + %mask = zext i1 %success to i32 + ret i32 %mask +} + + +define i32 @cmpxchg_use_eflags_and_val(i32* %addr, i32 %offset) { +; CHECK-LABEL: cmpxchg_use_eflags_and_val: +; CHECK: movl (%rdi), %e[[OLDVAL:[a-z0-9]+]] + +; CHECK: [[LOOPBB:.?LBB[0-9]+_[0-9]+]]: +; CHECK: leal (%r[[OLDVAL]],%rsi), [[NEW:%[a-z0-9]+]] +; CHECK: cmpxchgl [[NEW]], (%rdi) +; CHECK-NOT: cmpl +; CHECK: jne [[LOOPBB]] + + ; Result already in %eax +; CHECK: retq +entry: + %init = load atomic i32* %addr seq_cst, align 4 + br label %loop + +loop: + %old = phi i32 [%init, %entry], [%oldval, %loop] + %new = add i32 %old, %offset + %pair = cmpxchg i32* %addr, i32 %old, i32 %new seq_cst seq_cst + %oldval = extractvalue { i32, i1 } %pair, 0 + %success = extractvalue { i32, i1 } %pair, 1 + br i1 %success, label %done, label %loop + +done: + ret i32 %oldval +} + +declare void @foo() +declare void @bar() diff --git a/test/CodeGen/X86/cmpxchg-i128-i1.ll b/test/CodeGen/X86/cmpxchg-i128-i1.ll new file mode 100644 index 0000000000..4dd30013ec --- /dev/null +++ b/test/CodeGen/X86/cmpxchg-i128-i1.ll @@ -0,0 +1,83 @@ +; RUN: llc -mcpu=core-avx2 -mtriple=x86_64 -o - %s | FileCheck %s + +define i1 @try_cmpxchg(i128* %addr, i128 %desired, i128 %new) { +; CHECK-LABEL: try_cmpxchg: +; CHECK: cmpxchg16b +; CHECK-NOT: cmp +; CHECK: sete %al +; CHECK: retq + %pair = cmpxchg i128* %addr, i128 %desired, i128 %new seq_cst seq_cst + %success = extractvalue { i128, i1 } %pair, 1 + ret i1 %success +} + +define void @cmpxchg_flow(i128* %addr, i128 %desired, i128 %new) { +; CHECK-LABEL: cmpxchg_flow: +; CHECK: cmpxchg16b +; CHECK-NOT: cmp +; CHECK-NOT: set +; CHECK: {{jne|jeq}} + %pair = cmpxchg i128* %addr, i128 %desired, i128 %new seq_cst seq_cst + %success = extractvalue { i128, i1 } %pair, 1 + br i1 %success, label %true, label %false + +true: + call void @foo() + ret void + +false: + call void @bar() + ret void +} + +; Can't use the flags here because cmpxchg16b only sets ZF. +define i1 @cmpxchg_arithcmp(i128* %addr, i128 %desired, i128 %new) { +; CHECK-LABEL: cmpxchg_arithcmp: +; CHECK: cmpxchg16b +; CHECK: cmpq +; CHECK: retq + %pair = cmpxchg i128* %addr, i128 %desired, i128 %new seq_cst seq_cst + %oldval = extractvalue { i128, i1 } %pair, 0 + %success = icmp sge i128 %oldval, %desired + ret i1 %success +} + +define i128 @cmpxchg_zext(i128* %addr, i128 %desired, i128 %new) { +; CHECK-LABEL: cmpxchg_zext: +; CHECK: cmpxchg16b +; CHECK-NOT: cmpq +; CHECK: sete [[BYTE:%[a-z0-9]+]] +; CHECK: movzbl [[BYTE]], %eax + %pair = cmpxchg i128* %addr, i128 %desired, i128 %new seq_cst seq_cst + %success = extractvalue { i128, i1 } %pair, 1 + %mask = zext i1 %success to i128 + ret i128 %mask +} + + +define i128 @cmpxchg_use_eflags_and_val(i128* %addr, i128 %offset) { +; CHECK-LABEL: cmpxchg_use_eflags_and_val: + +; CHECK: cmpxchg16b +; CHECK-NOT: cmpq +; CHECK: jne +entry: + %init = load atomic i128* %addr seq_cst, align 16 + br label %loop + +loop: + %old = phi i128 [%init, %entry], [%oldval, %loop] + %new = add i128 %old, %offset + + %pair = cmpxchg i128* %addr, i128 %old, i128 %new seq_cst seq_cst + %oldval = extractvalue { i128, i1 } %pair, 0 + %success = extractvalue { i128, i1 } %pair, 1 + + br i1 %success, label %done, label %loop + +done: + ret i128 %old +} + +declare void @foo() +declare void @bar() -- cgit v1.2.3