-rw-r--r--  include/llvm/Target/TargetInstrInfo.h         11
-rw-r--r--  lib/CodeGen/PeepholeOptimizer.cpp              57
-rw-r--r--  lib/Target/X86/X86InstrInfo.cpp                75
-rw-r--r--  lib/Target/X86/X86InstrInfo.h                   8
-rw-r--r--  test/CodeGen/X86/2012-05-19-avx2-store.ll       3
-rw-r--r--  test/CodeGen/X86/break-sse-dep.ll               3
-rw-r--r--  test/CodeGen/X86/fold-load.ll                  26
-rw-r--r--  test/CodeGen/X86/fold-pcmpeqd-1.ll             11
-rw-r--r--  test/CodeGen/X86/sse-minmax.ll                 66
-rw-r--r--  test/CodeGen/X86/vec_compare.ll                 6
10 files changed, 212 insertions, 54 deletions
diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h
index 73efc507fa..cfb9dd7de2 100644
--- a/include/llvm/Target/TargetInstrInfo.h
+++ b/include/llvm/Target/TargetInstrInfo.h
@@ -14,6 +14,7 @@
#ifndef LLVM_TARGET_TARGETINSTRINFO_H
#define LLVM_TARGET_TARGETINSTRINFO_H
+#include "llvm/ADT/SmallSet.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/CodeGen/DFAPacketizer.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -693,6 +694,16 @@ public:
return false;
}
+ /// optimizeLoadInstr - Try to remove the load by folding it to a register
+ /// operand at the use. We fold the load instructions if and only if the
+ /// def and use are in the same BB.
+ virtual MachineInstr* optimizeLoadInstr(MachineInstr *MI,
+ const MachineRegisterInfo *MRI,
+ unsigned &FoldAsLoadDefReg,
+ MachineInstr *&DefMI) const {
+ return 0;
+ }
+
/// FoldImmediate - 'Reg' is known to be defined by a move immediate
/// instruction, try to fold the immediate into the use instruction.
virtual bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
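The fold this new hook enables shows up directly in the break-sse-dep.ll change further down: a load whose virtual register is defined and used exactly once in the same block is folded into its use as a memory operand. A minimal sketch of the x86 effect (taken from that test; register names illustrative):

    ; before: separate load and use
    movsd  (%rdi), %xmm0
    sqrtsd %xmm0, %xmm0
    ; after the peephole folds the load into its only use
    sqrtsd (%rdi), %xmm0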
diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp
index 91c33c4af4..d9474bf240 100644
--- a/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/lib/CodeGen/PeepholeOptimizer.cpp
@@ -78,6 +78,7 @@ STATISTIC(NumReuse, "Number of extension results reused");
STATISTIC(NumBitcasts, "Number of bitcasts eliminated");
STATISTIC(NumCmps, "Number of compares eliminated");
STATISTIC(NumImmFold, "Number of move immediate folded");
+STATISTIC(NumLoadFold, "Number of loads folded");
namespace {
class PeepholeOptimizer : public MachineFunctionPass {
@@ -114,6 +115,7 @@ namespace {
bool foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB,
SmallSet<unsigned, 4> &ImmDefRegs,
DenseMap<unsigned, MachineInstr*> &ImmDefMIs);
+ bool isLoadFoldable(MachineInstr *MI, unsigned &FoldAsLoadDefReg);
};
}
@@ -384,6 +386,29 @@ bool PeepholeOptimizer::optimizeCmpInstr(MachineInstr *MI,
return false;
}
+/// isLoadFoldable - Check whether MI is a candidate for folding into a later
+/// instruction. We only fold loads to virtual registers and the virtual
+/// register defined has a single use.
+bool PeepholeOptimizer::isLoadFoldable(MachineInstr *MI,
+ unsigned &FoldAsLoadDefReg) {
+ if (MI->canFoldAsLoad()) {
+ const MCInstrDesc &MCID = MI->getDesc();
+ if (MCID.getNumDefs() == 1) {
+ unsigned Reg = MI->getOperand(0).getReg();
+ // To reduce compilation time, we check MRI->hasOneUse when inserting
+ // loads. It should be checked when processing uses of the load, since
+ // uses can be removed during peephole.
+ if (!MI->getOperand(0).getSubReg() &&
+ TargetRegisterInfo::isVirtualRegister(Reg) &&
+ MRI->hasOneUse(Reg)) {
+ FoldAsLoadDefReg = Reg;
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
bool PeepholeOptimizer::isMoveImmediate(MachineInstr *MI,
SmallSet<unsigned, 4> &ImmDefRegs,
DenseMap<unsigned, MachineInstr*> &ImmDefMIs) {
@@ -441,6 +466,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
SmallPtrSet<MachineInstr*, 8> LocalMIs;
SmallSet<unsigned, 4> ImmDefRegs;
DenseMap<unsigned, MachineInstr*> ImmDefMIs;
+ unsigned FoldAsLoadDefReg;
for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
MachineBasicBlock *MBB = &*I;
@@ -448,6 +474,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
LocalMIs.clear();
ImmDefRegs.clear();
ImmDefMIs.clear();
+ FoldAsLoadDefReg = 0;
bool First = true;
MachineBasicBlock::iterator PMII;
@@ -456,12 +483,17 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
MachineInstr *MI = &*MII;
LocalMIs.insert(MI);
+ // If there exists an instruction which belongs to the following
+ // categories, we will discard the load candidate.
if (MI->isLabel() || MI->isPHI() || MI->isImplicitDef() ||
MI->isKill() || MI->isInlineAsm() || MI->isDebugValue() ||
MI->hasUnmodeledSideEffects()) {
+ FoldAsLoadDefReg = 0;
++MII;
continue;
}
+ if (MI->mayStore() || MI->isCall())
+ FoldAsLoadDefReg = 0;
if (MI->isBitcast()) {
if (optimizeBitcastInstr(MI, MBB)) {
@@ -489,6 +521,31 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
Changed |= foldImmediate(MI, MBB, ImmDefRegs, ImmDefMIs);
}
+ // Check whether MI is a load candidate for folding into a later
+ // instruction. If MI is not a candidate, check whether we can fold an
+ // earlier load into MI.
+ if (!isLoadFoldable(MI, FoldAsLoadDefReg) && FoldAsLoadDefReg) {
+ // We need to fold load after optimizeCmpInstr, since optimizeCmpInstr
+ // can enable folding by converting SUB to CMP.
+ MachineInstr *DefMI = 0;
+ MachineInstr *FoldMI = TII->optimizeLoadInstr(MI, MRI,
+ FoldAsLoadDefReg, DefMI);
+ if (FoldMI) {
+ // Update LocalMIs since we replaced MI with FoldMI and deleted DefMI.
+ LocalMIs.erase(MI);
+ LocalMIs.erase(DefMI);
+ LocalMIs.insert(FoldMI);
+ MI->eraseFromParent();
+ DefMI->eraseFromParent();
+ ++NumLoadFold;
+
+ // MI is replaced with FoldMI.
+ Changed = true;
+ PMII = FoldMI;
+ MII = llvm::next(PMII);
+ continue;
+ }
+ }
First = false;
PMII = MII;
++MII;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 3f16372497..ec793f8eb6 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -3323,6 +3323,81 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
return true;
}
+/// optimizeLoadInstr - Try to remove the load by folding it to a register
+/// operand at the use. We fold the load instructions if load defines a virtual
+/// register, the virtual register is used once in the same BB, and the
+/// instructions in-between do not load or store, and have no side effects.
+MachineInstr* X86InstrInfo::
+optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI,
+ unsigned &FoldAsLoadDefReg,
+ MachineInstr *&DefMI) const {
+ if (FoldAsLoadDefReg == 0)
+ return 0;
+ // To be conservative, if there exists another load, clear the load candidate.
+ if (MI->mayLoad()) {
+ FoldAsLoadDefReg = 0;
+ return 0;
+ }
+
+ // Check whether we can move DefMI here.
+ DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
+ assert(DefMI);
+ bool SawStore = false;
+ if (!DefMI->isSafeToMove(this, 0, SawStore))
+ return 0;
+
+ // We try to commute MI if possible.
+ unsigned IdxEnd = (MI->isCommutable()) ? 2 : 1;
+ for (unsigned Idx = 0; Idx < IdxEnd; Idx++) {
+ // Collect information about virtual register operands of MI.
+ unsigned SrcOperandId = 0;
+ bool FoundSrcOperand = false;
+ for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (Reg != FoldAsLoadDefReg)
+ continue;
+ // Do not fold if we have a subreg use or a def or multiple uses.
+ if (MO.getSubReg() || MO.isDef() || FoundSrcOperand)
+ return 0;
+
+ SrcOperandId = i;
+ FoundSrcOperand = true;
+ }
+ if (!FoundSrcOperand) return 0;
+
+ // Check whether we can fold the def into SrcOperandId.
+ SmallVector<unsigned, 8> Ops;
+ Ops.push_back(SrcOperandId);
+ MachineInstr *FoldMI = foldMemoryOperand(MI, Ops, DefMI);
+ if (FoldMI) {
+ FoldAsLoadDefReg = 0;
+ return FoldMI;
+ }
+
+ if (Idx == 1) {
+ // MI was changed but it didn't help, commute it back!
+ commuteInstruction(MI, false);
+ return 0;
+ }
+
+ // Check whether we can commute MI and enable folding.
+ if (MI->isCommutable()) {
+ MachineInstr *NewMI = commuteInstruction(MI, false);
+ // Unable to commute.
+ if (!NewMI) return 0;
+ if (NewMI != MI) {
+ // New instruction. It doesn't need to be kept.
+ NewMI->eraseFromParent();
+ return 0;
+ }
+ }
+ }
+ return 0;
+}
+
/// Expand2AddrUndef - Expand a single-def pseudo instruction to a two-addr
/// instruction with two undef reads of the register being defined. This is
/// used for mapping:
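The commute-and-retry loop in optimizeLoadInstr above is needed because the first source of a two-address x86 instruction is tied to the destination, so only the other source can become a memory operand; when the loaded register sits in the tied slot, commuting first lets foldMemoryOperand succeed. A rough MachineInstr-level sketch (vreg numbers and the <mem> operand are illustrative placeholders, not verbatim llc output):

    %vreg1 = MOV32rm <mem>             ; load candidate with a single use
    %vreg2 = XOR32rr %vreg1, %vreg0    ; %vreg1 is in the tied slot: not foldable
    ; after commuteInstruction(MI, false):
    %vreg2 = XOR32rr %vreg0, %vreg1    ; loaded reg is now the foldable operand
    %vreg2 = XOR32rm %vreg0, <mem>     ; result of foldMemoryOperand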
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index ec9b2e619d..9ed5210b22 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -387,6 +387,14 @@ public:
unsigned SrcReg2, int CmpMask, int CmpValue,
const MachineRegisterInfo *MRI) const;
+ /// optimizeLoadInstr - Try to remove the load by folding it to a register
+ /// operand at the use. We fold the load instructions if and only if the
+ /// def and use are in the same BB.
+ virtual MachineInstr* optimizeLoadInstr(MachineInstr *MI,
+ const MachineRegisterInfo *MRI,
+ unsigned &FoldAsLoadDefReg,
+ MachineInstr *&DefMI) const;
+
private:
MachineInstr * convertToThreeAddressWithLEA(unsigned MIOpc,
MachineFunction::iterator &MFI,
diff --git a/test/CodeGen/X86/2012-05-19-avx2-store.ll b/test/CodeGen/X86/2012-05-19-avx2-store.ll
index 61fef90139..1c1e8e2f0a 100644
--- a/test/CodeGen/X86/2012-05-19-avx2-store.ll
+++ b/test/CodeGen/X86/2012-05-19-avx2-store.ll
@@ -3,8 +3,7 @@
define void @double_save(<4 x i32>* %Ap, <4 x i32>* %Bp, <8 x i32>* %P) nounwind ssp {
entry:
; CHECK: vmovaps
- ; CHECK: vmovaps
- ; CHECK: vinsertf128
+ ; CHECK: vinsertf128 $1, ([[A0:%rdi|%rsi]]),
; CHECK: vmovups
%A = load <4 x i32>* %Ap
%B = load <4 x i32>* %Bp
diff --git a/test/CodeGen/X86/break-sse-dep.ll b/test/CodeGen/X86/break-sse-dep.ll
index 3e65867143..4d801891da 100644
--- a/test/CodeGen/X86/break-sse-dep.ll
+++ b/test/CodeGen/X86/break-sse-dep.ll
@@ -34,8 +34,7 @@ entry:
define double @squirt(double* %x) nounwind {
entry:
; CHECK: squirt:
-; CHECK: movsd ([[A0]]), %xmm0
-; CHECK: sqrtsd %xmm0, %xmm0
+; CHECK: sqrtsd ([[A0]]), %xmm0
%z = load double* %x
%t = call double @llvm.sqrt.f64(double %z)
ret double %t
diff --git a/test/CodeGen/X86/fold-load.ll b/test/CodeGen/X86/fold-load.ll
index e03cb7edb5..c961f7576f 100644
--- a/test/CodeGen/X86/fold-load.ll
+++ b/test/CodeGen/X86/fold-load.ll
@@ -45,3 +45,29 @@ L:
}
+; rdar://10554090
+; xor in exit block will be CSE'ed and load will be folded to xor in entry.
+define i1 @test3(i32* %P, i32* %Q) nounwind {
+; CHECK: test3:
+; CHECK: movl 8(%esp), %eax
+; CHECK: xorl (%eax),
+; CHECK: j
+; CHECK-NOT: xor
+entry:
+ %0 = load i32* %P, align 4
+ %1 = load i32* %Q, align 4
+ %2 = xor i32 %0, %1
+ %3 = and i32 %2, 65535
+ %4 = icmp eq i32 %3, 0
+ br i1 %4, label %exit, label %land.end
+
+exit:
+ %shr.i.i19 = xor i32 %1, %0
+ %5 = and i32 %shr.i.i19, 2147418112
+ %6 = icmp eq i32 %5, 0
+ br label %land.end
+
+land.end:
+ %7 = phi i1 [ %6, %exit ], [ false, %entry ]
+ ret i1 %7
+}
diff --git a/test/CodeGen/X86/fold-pcmpeqd-1.ll b/test/CodeGen/X86/fold-pcmpeqd-1.ll
index cc4198d7ca..a35dccddba 100644
--- a/test/CodeGen/X86/fold-pcmpeqd-1.ll
+++ b/test/CodeGen/X86/fold-pcmpeqd-1.ll
@@ -1,11 +1,14 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 > %t
-; RUN: grep pcmpeqd %t | count 1
-; RUN: grep xor %t | count 1
-; RUN: not grep LCP %t
+; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
define <2 x double> @foo() nounwind {
ret <2 x double> bitcast (<2 x i64><i64 -1, i64 -1> to <2 x double>)
+; CHECK: foo:
+; CHECK: pcmpeqd %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
+; CHECK-NEXT: ret
}
define <2 x double> @bar() nounwind {
ret <2 x double> bitcast (<2 x i64><i64 0, i64 0> to <2 x double>)
+; CHECK: bar:
+; CHECK: xorps %xmm{{[0-9]+}}, %xmm{{[0-9]+}}
+; CHECK-NEXT: ret
}
diff --git a/test/CodeGen/X86/sse-minmax.ll b/test/CodeGen/X86/sse-minmax.ll
index 4405f68451..5d3dbce1df 100644
--- a/test/CodeGen/X86/sse-minmax.ll
+++ b/test/CodeGen/X86/sse-minmax.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86-64 -mcpu=nehalem -asm-verbose=false | FileCheck %s
-; RUN: llc < %s -march=x86-64 -mcpu=nehalem -asm-verbose=false -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck -check-prefix=UNSAFE %s
-; RUN: llc < %s -march=x86-64 -mcpu=nehalem -asm-verbose=false -enable-no-nans-fp-math | FileCheck -check-prefix=FINITE %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=nehalem -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=nehalem -asm-verbose=false -enable-unsafe-fp-math -enable-no-nans-fp-math | FileCheck -check-prefix=UNSAFE %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=nehalem -asm-verbose=false -enable-no-nans-fp-math | FileCheck -check-prefix=FINITE %s
; Some of these patterns can be matched as SSE min or max. Some of
; then can be matched provided that the operands are swapped.
@@ -137,16 +137,13 @@ define double @ole_inverse(double %x, double %y) nounwind {
}
; CHECK: ogt_x:
-; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; CHECK-NEXT: maxsd %xmm1, %xmm0
+; CHECK-NEXT: maxsd LCP{{.*}}(%rip), %xmm0
; CHECK-NEXT: ret
; UNSAFE: ogt_x:
-; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd LCP{{.*}}(%rip), %xmm0
; UNSAFE-NEXT: ret
; FINITE: ogt_x:
-; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; FINITE-NEXT: maxsd %xmm1, %xmm0
+; FINITE-NEXT: maxsd LCP{{.*}}(%rip), %xmm0
; FINITE-NEXT: ret
define double @ogt_x(double %x) nounwind {
%c = fcmp ogt double %x, 0.000000e+00
@@ -155,16 +152,13 @@ define double @ogt_x(double %x) nounwind {
}
; CHECK: olt_x:
-; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; CHECK-NEXT: minsd %xmm1, %xmm0
+; CHECK-NEXT: minsd LCP{{.*}}(%rip), %xmm0
; CHECK-NEXT: ret
; UNSAFE: olt_x:
-; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm1, %xmm0
+; UNSAFE-NEXT: minsd LCP{{.*}}(%rip), %xmm0
; UNSAFE-NEXT: ret
; FINITE: olt_x:
-; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; FINITE-NEXT: minsd %xmm1, %xmm0
+; FINITE-NEXT: minsd LCP{{.*}}(%rip), %xmm0
; FINITE-NEXT: ret
define double @olt_x(double %x) nounwind {
%c = fcmp olt double %x, 0.000000e+00
@@ -217,12 +211,10 @@ define double @olt_inverse_x(double %x) nounwind {
; CHECK: oge_x:
; CHECK: ucomisd %xmm1, %xmm0
; UNSAFE: oge_x:
-; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd LCP{{.*}}(%rip), %xmm0
; UNSAFE-NEXT: ret
; FINITE: oge_x:
-; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; FINITE-NEXT: maxsd %xmm1, %xmm0
+; FINITE-NEXT: maxsd LCP{{.*}}(%rip), %xmm0
; FINITE-NEXT: ret
define double @oge_x(double %x) nounwind {
%c = fcmp oge double %x, 0.000000e+00
@@ -233,12 +225,10 @@ define double @oge_x(double %x) nounwind {
; CHECK: ole_x:
; CHECK: ucomisd %xmm0, %xmm1
; UNSAFE: ole_x:
-; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm1, %xmm0
+; UNSAFE-NEXT: minsd LCP{{.*}}(%rip), %xmm0
; UNSAFE-NEXT: ret
; FINITE: ole_x:
-; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; FINITE-NEXT: minsd %xmm1, %xmm0
+; FINITE-NEXT: minsd LCP{{.*}}(%rip), %xmm0
; FINITE-NEXT: ret
define double @ole_x(double %x) nounwind {
%c = fcmp ole double %x, 0.000000e+00
@@ -411,12 +401,10 @@ define double @ule_inverse(double %x, double %y) nounwind {
; CHECK: ugt_x:
; CHECK: ucomisd %xmm0, %xmm1
; UNSAFE: ugt_x:
-; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd LCP{{.*}}(%rip), %xmm0
; UNSAFE-NEXT: ret
; FINITE: ugt_x:
-; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; FINITE-NEXT: maxsd %xmm1, %xmm0
+; FINITE-NEXT: maxsd LCP{{.*}}(%rip), %xmm0
; FINITE-NEXT: ret
define double @ugt_x(double %x) nounwind {
%c = fcmp ugt double %x, 0.000000e+00
@@ -427,12 +415,10 @@ define double @ugt_x(double %x) nounwind {
; CHECK: ult_x:
; CHECK: ucomisd %xmm1, %xmm0
; UNSAFE: ult_x:
-; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm1, %xmm0
+; UNSAFE-NEXT: minsd LCP{{.*}}(%rip), %xmm0
; UNSAFE-NEXT: ret
; FINITE: ult_x:
-; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; FINITE-NEXT: minsd %xmm1, %xmm0
+; FINITE-NEXT: minsd LCP{{.*}}(%rip), %xmm0
; FINITE-NEXT: ret
define double @ult_x(double %x) nounwind {
%c = fcmp ult double %x, 0.000000e+00
@@ -482,12 +468,10 @@ define double @ult_inverse_x(double %x) nounwind {
; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
; CHECK-NEXT: ret
; UNSAFE: uge_x:
-; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: maxsd %xmm1, %xmm0
+; UNSAFE-NEXT: maxsd LCP{{.*}}(%rip), %xmm0
; UNSAFE-NEXT: ret
; FINITE: uge_x:
-; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; FINITE-NEXT: maxsd %xmm1, %xmm0
+; FINITE-NEXT: maxsd LCP{{.*}}(%rip), %xmm0
; FINITE-NEXT: ret
define double @uge_x(double %x) nounwind {
%c = fcmp uge double %x, 0.000000e+00
@@ -501,12 +485,10 @@ define double @uge_x(double %x) nounwind {
; CHECK-NEXT: movap{{[sd]}} %xmm1, %xmm0
; CHECK-NEXT: ret
; UNSAFE: ule_x:
-; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; UNSAFE-NEXT: minsd %xmm1, %xmm0
+; UNSAFE-NEXT: minsd LCP{{.*}}(%rip), %xmm0
; UNSAFE-NEXT: ret
; FINITE: ule_x:
-; FINITE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; FINITE-NEXT: minsd %xmm1, %xmm0
+; FINITE-NEXT: minsd LCP{{.*}}(%rip), %xmm0
; FINITE-NEXT: ret
define double @ule_x(double %x) nounwind {
%c = fcmp ule double %x, 0.000000e+00
@@ -515,8 +497,7 @@ define double @ule_x(double %x) nounwind {
}
; CHECK: uge_inverse_x:
-; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; CHECK-NEXT: minsd %xmm1, %xmm0
+; CHECK-NEXT: minsd LCP{{.*}}(%rip), %xmm0
; CHECK-NEXT: ret
; UNSAFE: uge_inverse_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
@@ -535,8 +516,7 @@ define double @uge_inverse_x(double %x) nounwind {
}
; CHECK: ule_inverse_x:
-; CHECK-NEXT: xorp{{[sd]}} %xmm1, %xmm1
-; CHECK-NEXT: maxsd %xmm1, %xmm0
+; CHECK-NEXT: maxsd LCP{{.*}}(%rip), %xmm0
; CHECK-NEXT: ret
; UNSAFE: ule_inverse_x:
; UNSAFE-NEXT: xorp{{[sd]}} %xmm1, %xmm1
diff --git a/test/CodeGen/X86/vec_compare.ll b/test/CodeGen/X86/vec_compare.ll
index 39c9b770d5..1e04f19ee8 100644
--- a/test/CodeGen/X86/vec_compare.ll
+++ b/test/CodeGen/X86/vec_compare.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=yonah -mtriple=i386-apple-darwin | FileCheck %s
define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) nounwind {
@@ -14,8 +14,8 @@ define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) nounwind {
define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) nounwind {
; CHECK: test2:
; CHECK: pcmp
-; CHECK: pcmp
-; CHECK: pxor
+; CHECK: pxor LCP
+; CHECK: movdqa
; CHECK: ret
%C = icmp sge <4 x i32> %A, %B
%D = sext <4 x i1> %C to <4 x i32>