From 35a6a814073e9ce8c0722d23788eae2eb86832d9 Mon Sep 17 00:00:00 2001 From: Juergen Ributzka Date: Tue, 24 Jun 2014 23:51:21 +0000 Subject: [FastISel][X86] Fold XALU condition into branch and compare. Optimize the codegen of select and branch instructions to directly use the EFLAGS from the {s|u}{add|sub|mul}.with.overflow intrinsics. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@211645 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86FastISel.cpp | 150 ++++++++++++++++++ test/CodeGen/X86/xaluo.ll | 350 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 498 insertions(+), 2 deletions(-) diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 6625a706d2..17999a59aa 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -1398,6 +1398,84 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { return true; } } + } else if (auto *EV = dyn_cast<ExtractValueInst>(BI->getCondition())) { + bool FoldIntrinsic = false; + if (const auto *II = dyn_cast<IntrinsicInst>(EV->getAggregateOperand())) { + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::sadd_with_overflow: + case Intrinsic::uadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::usub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: FoldIntrinsic = true; break; + } + + // Check if both instructions are in the same basic block. + if (FoldIntrinsic && (II->getParent() != I->getParent())) + FoldIntrinsic = false; + + // Make sure nothing is in the way + if (FoldIntrinsic) { + BasicBlock::const_iterator Start = I; + BasicBlock::const_iterator End = II; + for (auto Itr = std::prev(Start); Itr != End; --Itr) { + // We only expect extractvalue instructions between the intrinsic and + // the branch. + if (!isa<ExtractValueInst>(Itr)) { + FoldIntrinsic = false; + break; + } + + // Check that the extractvalue operand comes from the intrinsic. 
+ const auto *EVI = cast<ExtractValueInst>(Itr); + if (EVI->getAggregateOperand() != II) { + FoldIntrinsic = false; + break; + } + } + } + } + + if (FoldIntrinsic) { + MVT RetVT; + const IntrinsicInst *II = cast<IntrinsicInst>(EV->getAggregateOperand()); + const Function *Callee = II->getCalledFunction(); + Type *RetTy = + cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U); + if (!isTypeLegal(RetTy, RetVT)) + return false; + + if (RetVT != MVT::i32 && RetVT != MVT::i64) + return false; + + // Fake request the condition, otherwise the intrinsic might be completely + // optimized away. + unsigned TmpReg = getRegForValue(EV); + if (TmpReg == 0) + return false; + + unsigned BranchOpc = 0; + switch (II->getIntrinsicID()) { + default: llvm_unreachable("Unexpected intrinsic instruction."); + case Intrinsic::sadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: BranchOpc = X86::JO_4; break; + case Intrinsic::uadd_with_overflow: + case Intrinsic::usub_with_overflow: BranchOpc = X86::JB_4; break; + } + + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(BranchOpc)) + .addMBB(TrueMBB); + FastEmitBranch(FalseMBB, DbgLoc); + uint32_t BranchWeight = 0; + if (FuncInfo.BPI) + BranchWeight = FuncInfo.BPI->getEdgeWeight(BI->getParent(), + TrueMBB->getBasicBlock()); + FuncInfo.MBB->addSuccessor(TrueMBB, BranchWeight); + return true; + } } // Otherwise do a clumsy setcc and re-test it. 
@@ -1732,6 +1810,78 @@ bool X86FastISel::X86FastEmitCMoveSelect(const Instruction *I) { } } NeedTest = false; + } else if (auto *EV = dyn_cast<ExtractValueInst>(Cond)) { + bool FoldIntrinsic = false; + if (const auto *II = dyn_cast<IntrinsicInst>(EV->getAggregateOperand())) { + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::sadd_with_overflow: + case Intrinsic::uadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::usub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: FoldIntrinsic = true; break; + } + + // Check if both instructions are in the same basic block. + if (FoldIntrinsic && (II->getParent() != I->getParent())) + FoldIntrinsic = false; + + // Make sure nothing is in the way + if (FoldIntrinsic) { + BasicBlock::const_iterator Start = I; + BasicBlock::const_iterator End = II; + for (auto Itr = std::prev(Start); Itr != End; --Itr) { + // We only expect extractvalue instructions between the intrinsic and + // the branch. + if (!isa<ExtractValueInst>(Itr)) { + FoldIntrinsic = false; + break; + } + + // Check that the extractvalue operand comes from the intrinsic. + const auto *EVI = cast<ExtractValueInst>(Itr); + if (EVI->getAggregateOperand() != II) { + FoldIntrinsic = false; + break; + } + } + } + } + + if (FoldIntrinsic) { + MVT RetVT; + const IntrinsicInst *II = cast<IntrinsicInst>(EV->getAggregateOperand()); + const Function *Callee = II->getCalledFunction(); + Type *RetTy = + cast<StructType>(Callee->getReturnType())->getTypeAtIndex(0U); + if (!isTypeLegal(RetTy, RetVT)) + return false; + + if (RetVT != MVT::i32 && RetVT != MVT::i64) + return false; + + // Fake request the condition, otherwise the intrinsic might be completely + // optimized away. 
+ unsigned TmpReg = getRegForValue(EV); + if (TmpReg == 0) + return false; + + switch (II->getIntrinsicID()) { + default: llvm_unreachable("Unexpected intrinsic instruction."); + case Intrinsic::sadd_with_overflow: + case Intrinsic::ssub_with_overflow: + case Intrinsic::smul_with_overflow: + case Intrinsic::umul_with_overflow: + Opc = X86::getCMovFromCond(X86::COND_O, RC->getSize()); + break; + case Intrinsic::uadd_with_overflow: + case Intrinsic::usub_with_overflow: + Opc = X86::getCMovFromCond(X86::COND_B, RC->getSize()); + break; + } + NeedTest = false; + } } if (NeedTest) { diff --git a/test/CodeGen/X86/xaluo.ll b/test/CodeGen/X86/xaluo.ll index a9aa8fa96a..c236f80936 100644 --- a/test/CodeGen/X86/xaluo.ll +++ b/test/CodeGen/X86/xaluo.ll @@ -1,5 +1,7 @@ -; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s -check-prefix=DAG -; RUN: llc -mtriple=x86_64-unknown-unknown -fast-isel -fast-isel-abort < %s | FileCheck %s --check-prefix=FAST +; RUN: llc -mtriple=x86_64-darwin-unknown < %s | FileCheck %s --check-prefix=DAG +; RUN: llc -mtriple=x86_64-darwin-unknown -fast-isel -fast-isel-abort < %s | FileCheck %s --check-prefix=FAST +; RUN: llc -mtriple=x86_64-darwin-unknown < %s | FileCheck %s +; RUN: llc -mtriple=x86_64-darwin-unknown -fast-isel -fast-isel-abort < %s | FileCheck %s ; ; Get the actual value of the overflow bit. @@ -320,6 +322,349 @@ entry: ret i1 %obit } +; +; Check the use of the overflow bit in combination with a select instruction. 
+; +define i32 @saddo.select.i32(i32 %v1, i32 %v2) { +entry: +; CHECK-LABEL: saddo.select.i32 +; CHECK: addl %esi, %eax +; CHECK-NEXT: cmovol %edi, %esi + %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) + %obit = extractvalue {i32, i1} %t, 1 + %ret = select i1 %obit, i32 %v1, i32 %v2 + ret i32 %ret +} + +define i64 @saddo.select.i64(i64 %v1, i64 %v2) { +entry: +; CHECK-LABEL: saddo.select.i64 +; CHECK: addq %rsi, %rax +; CHECK-NEXT: cmovoq %rdi, %rsi + %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2) + %obit = extractvalue {i64, i1} %t, 1 + %ret = select i1 %obit, i64 %v1, i64 %v2 + ret i64 %ret +} + +define i32 @uaddo.select.i32(i32 %v1, i32 %v2) { +entry: +; CHECK-LABEL: uaddo.select.i32 +; CHECK: addl %esi, %eax +; CHECK-NEXT: cmovbl %edi, %esi + %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2) + %obit = extractvalue {i32, i1} %t, 1 + %ret = select i1 %obit, i32 %v1, i32 %v2 + ret i32 %ret +} + +define i64 @uaddo.select.i64(i64 %v1, i64 %v2) { +entry: +; CHECK-LABEL: uaddo.select.i64 +; CHECK: addq %rsi, %rax +; CHECK-NEXT: cmovbq %rdi, %rsi + %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2) + %obit = extractvalue {i64, i1} %t, 1 + %ret = select i1 %obit, i64 %v1, i64 %v2 + ret i64 %ret +} + +define i32 @ssubo.select.i32(i32 %v1, i32 %v2) { +entry: +; CHECK-LABEL: ssubo.select.i32 +; CHECK: cmpl %esi, %edi +; CHECK-NEXT: cmovol %edi, %esi + %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2) + %obit = extractvalue {i32, i1} %t, 1 + %ret = select i1 %obit, i32 %v1, i32 %v2 + ret i32 %ret +} + +define i64 @ssubo.select.i64(i64 %v1, i64 %v2) { +entry: +; CHECK-LABEL: ssubo.select.i64 +; CHECK: cmpq %rsi, %rdi +; CHECK-NEXT: cmovoq %rdi, %rsi + %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2) + %obit = extractvalue {i64, i1} %t, 1 + %ret = select i1 %obit, i64 %v1, i64 %v2 + ret i64 %ret +} + +define i32 @usubo.select.i32(i32 %v1, i32 %v2) { +entry: +; 
CHECK-LABEL: usubo.select.i32 +; CHECK: cmpl %esi, %edi +; CHECK-NEXT: cmovbl %edi, %esi + %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2) + %obit = extractvalue {i32, i1} %t, 1 + %ret = select i1 %obit, i32 %v1, i32 %v2 + ret i32 %ret +} + +define i64 @usubo.select.i64(i64 %v1, i64 %v2) { +entry: +; CHECK-LABEL: usubo.select.i64 +; CHECK: cmpq %rsi, %rdi +; CHECK-NEXT: cmovbq %rdi, %rsi + %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2) + %obit = extractvalue {i64, i1} %t, 1 + %ret = select i1 %obit, i64 %v1, i64 %v2 + ret i64 %ret +} + +define i32 @smulo.select.i32(i32 %v1, i32 %v2) { +entry: +; CHECK-LABEL: smulo.select.i32 +; CHECK: imull %esi, %eax +; CHECK-NEXT: cmovol %edi, %esi + %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2) + %obit = extractvalue {i32, i1} %t, 1 + %ret = select i1 %obit, i32 %v1, i32 %v2 + ret i32 %ret +} + +define i64 @smulo.select.i64(i64 %v1, i64 %v2) { +entry: +; CHECK-LABEL: smulo.select.i64 +; CHECK: imulq %rsi, %rax +; CHECK-NEXT: cmovoq %rdi, %rsi + %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2) + %obit = extractvalue {i64, i1} %t, 1 + %ret = select i1 %obit, i64 %v1, i64 %v2 + ret i64 %ret +} + +define i32 @umulo.select.i32(i32 %v1, i32 %v2) { +entry: +; CHECK-LABEL: umulo.select.i32 +; CHECK: mull %esi +; CHECK-NEXT: cmovol %edi, %esi + %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2) + %obit = extractvalue {i32, i1} %t, 1 + %ret = select i1 %obit, i32 %v1, i32 %v2 + ret i32 %ret +} + +define i64 @umulo.select.i64(i64 %v1, i64 %v2) { +entry: +; CHECK-LABEL: umulo.select.i64 +; CHECK: mulq %rsi +; CHECK-NEXT: cmovoq %rdi, %rsi + %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2) + %obit = extractvalue {i64, i1} %t, 1 + %ret = select i1 %obit, i64 %v1, i64 %v2 + ret i64 %ret +} + + +; +; Check the use of the overflow bit in combination with a branch instruction. 
+; +define zeroext i1 @saddo.br.i32(i32 %v1, i32 %v2) { +entry: +; CHECK-LABEL: saddo.br.i32 +; CHECK: addl %esi, %edi +; CHECK-NEXT: jo + %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2) + %val = extractvalue {i32, i1} %t, 0 + %obit = extractvalue {i32, i1} %t, 1 + br i1 %obit, label %overflow, label %continue, !prof !0 + +overflow: + ret i1 false + +continue: + ret i1 true +} + +define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) { +entry: +; CHECK-LABEL: saddo.br.i64 +; CHECK: addq %rsi, %rdi +; CHECK-NEXT: jo + %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2) + %val = extractvalue {i64, i1} %t, 0 + %obit = extractvalue {i64, i1} %t, 1 + br i1 %obit, label %overflow, label %continue, !prof !0 + +overflow: + ret i1 false + +continue: + ret i1 true +} + +define zeroext i1 @uaddo.br.i32(i32 %v1, i32 %v2) { +entry: +; CHECK-LABEL: uaddo.br.i32 +; CHECK: addl %esi, %edi +; CHECK-NEXT: jb + %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2) + %val = extractvalue {i32, i1} %t, 0 + %obit = extractvalue {i32, i1} %t, 1 + br i1 %obit, label %overflow, label %continue, !prof !0 + +overflow: + ret i1 false + +continue: + ret i1 true +} + +define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) { +entry: +; CHECK-LABEL: uaddo.br.i64 +; CHECK: addq %rsi, %rdi +; CHECK-NEXT: jb + %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2) + %val = extractvalue {i64, i1} %t, 0 + %obit = extractvalue {i64, i1} %t, 1 + br i1 %obit, label %overflow, label %continue, !prof !0 + +overflow: + ret i1 false + +continue: + ret i1 true +} + +define zeroext i1 @ssubo.br.i32(i32 %v1, i32 %v2) { +entry: +; CHECK-LABEL: ssubo.br.i32 +; CHECK: cmpl %esi, %edi +; CHECK-NEXT: jo + %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2) + %val = extractvalue {i32, i1} %t, 0 + %obit = extractvalue {i32, i1} %t, 1 + br i1 %obit, label %overflow, label %continue, !prof !0 + +overflow: + ret i1 false + +continue: + ret i1 true +} + 
+define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) { +entry: +; CHECK-LABEL: ssubo.br.i64 +; CHECK: cmpq %rsi, %rdi +; CHECK-NEXT: jo + %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2) + %val = extractvalue {i64, i1} %t, 0 + %obit = extractvalue {i64, i1} %t, 1 + br i1 %obit, label %overflow, label %continue, !prof !0 + +overflow: + ret i1 false + +continue: + ret i1 true +} + +define zeroext i1 @usubo.br.i32(i32 %v1, i32 %v2) { +entry: +; CHECK-LABEL: usubo.br.i32 +; CHECK: cmpl %esi, %edi +; CHECK-NEXT: jb + %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2) + %val = extractvalue {i32, i1} %t, 0 + %obit = extractvalue {i32, i1} %t, 1 + br i1 %obit, label %overflow, label %continue, !prof !0 + +overflow: + ret i1 false + +continue: + ret i1 true +} + +define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) { +entry: +; CHECK-LABEL: usubo.br.i64 +; CHECK: cmpq %rsi, %rdi +; CHECK-NEXT: jb + %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2) + %val = extractvalue {i64, i1} %t, 0 + %obit = extractvalue {i64, i1} %t, 1 + br i1 %obit, label %overflow, label %continue, !prof !0 + +overflow: + ret i1 false + +continue: + ret i1 true +} + +define zeroext i1 @smulo.br.i32(i32 %v1, i32 %v2) { +entry: +; CHECK-LABEL: smulo.br.i32 +; CHECK: imull %esi, %edi +; CHECK-NEXT: jo + %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2) + %val = extractvalue {i32, i1} %t, 0 + %obit = extractvalue {i32, i1} %t, 1 + br i1 %obit, label %overflow, label %continue, !prof !0 + +overflow: + ret i1 false + +continue: + ret i1 true +} + +define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) { +entry: +; CHECK-LABEL: smulo.br.i64 +; CHECK: imulq %rsi, %rdi +; CHECK-NEXT: jo + %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2) + %val = extractvalue {i64, i1} %t, 0 + %obit = extractvalue {i64, i1} %t, 1 + br i1 %obit, label %overflow, label %continue, !prof !0 + +overflow: + ret i1 false + +continue: + ret i1 true +} + +define 
zeroext i1 @umulo.br.i32(i32 %v1, i32 %v2) { +entry: +; CHECK-LABEL: umulo.br.i32 +; CHECK: mull %esi +; CHECK-NEXT: jo + %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2) + %val = extractvalue {i32, i1} %t, 0 + %obit = extractvalue {i32, i1} %t, 1 + br i1 %obit, label %overflow, label %continue, !prof !0 + +overflow: + ret i1 false + +continue: + ret i1 true +} + +define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) { +entry: +; CHECK-LABEL: umulo.br.i64 +; CHECK: mulq %rsi +; CHECK-NEXT: jo + %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2) + %val = extractvalue {i64, i1} %t, 0 + %obit = extractvalue {i64, i1} %t, 1 + br i1 %obit, label %overflow, label %continue, !prof !0 + +overflow: + ret i1 false + +continue: + ret i1 true +} + declare {i8, i1} @llvm.sadd.with.overflow.i8(i8, i8) nounwind readnone declare {i16, i1} @llvm.sadd.with.overflow.i16(i16, i16) nounwind readnone declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone @@ -335,3 +680,4 @@ declare {i64, i1} @llvm.smul.with.overflow.i64(i64, i64) nounwind readnone declare {i32, i1} @llvm.umul.with.overflow.i32(i32, i32) nounwind readnone declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone +!0 = metadata !{metadata !"branch_weights", i32 0, i32 2147483647} -- cgit v1.2.3