Implement support for using modeling implicit-zero-extension on x86-64

with SUBREG_TO_REG, teach SimpleRegisterCoalescing to coalesce SUBREG_TO_REG instructions (which are similar to INSERT_SUBREG instructions), and teach the DAGCombiner to take advantage of this on targets which support it. This eliminates many redundant zero-extension operations on x86-64. This adds a new TargetLowering hook, isZExtFree. It's similar to isTruncateFree, except it only applies to actual definitions, and not no-op truncates which may not zero the high bits. Also, this adds a new optimization to SimplifyDemandedBits: transform operations like x+y into (zext (add (trunc x), (trunc y))) on targets where all the casts are no-ops. In contexts where the high part of the add is explicitly masked off, this allows the mask operation to be eliminated. Fix the DAGCombiner to avoid undoing these transformations to eliminate casts on targets where the casts are no-ops. Also, this adds a new two-address lowering heuristic. Since two-address lowering runs before coalescing, it helps to be able to look through copies when deciding whether commuting and/or three-address conversion are profitable. Also, fix a bug in LiveInterval::MergeInClobberRanges. It didn't handle the case that a clobber range extended both before and beyond an existing live range. In that case, multiple live ranges need to be added. This was exposed by the new subreg coalescing code. Remove 2008-05-06-SpillerBug.ll. It was bugpoint-reduced, and the spiller behavior it was looking for no longer occurrs with the new instruction selection. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@68576 91177308-0d34-0410-b5e6-96231b3b80d8
author: Dan Gohman <gohman@apple.com> 2009-04-08 00:15:30 +0000
committer: Dan Gohman <gohman@apple.com> 2009-04-08 00:15:30 +0000
commit: 97121ba2afb8d566ff1bf5c4e8fc5d4077940a7f (patch)
tree: b530d1fd94181f009f5d7ff1d760c88336a67def
parent: a49a671efe24aa6e2e9e2280cf714ef97a40f177 (diff)
download: llvm-97121ba2afb8d566ff1bf5c4e8fc5d4077940a7f.tar.gz
llvm-97121ba2afb8d566ff1bf5c4e8fc5d4077940a7f.tar.bz2
llvm-97121ba2afb8d566ff1bf5c4e8fc5d4077940a7f.tar.xz
19 files changed, 502 insertions, 107 deletions
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index cbe4ca4f53..477505e2f7 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -747,6 +747,13 @@ public:
     /// there are any bits set in the constant that are not demanded.  If so,
     /// shrink the constant and return true.
     bool ShrinkDemandedConstant(SDValue Op, const APInt &Demanded);
+
+    /// ShrinkDemandedOp - Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the
+    /// casts are free.  This uses isZExtFree and ZERO_EXTEND for the widening
+    /// cast, but it could be generalized for targets with other types of
+    /// implicit widening casts.
+    bool ShrinkDemandedOp(SDValue Op, unsigned BitWidth, const APInt &Demanded,
+                          DebugLoc dl);
   };
                                                 
   /// SimplifyDemandedBits - Look at Op.  At this point, we know that only the
@@ -1386,7 +1393,23 @@ public:
   virtual bool isTruncateFree(MVT VT1, MVT VT2) const {
     return false;
   }
-  
+
+  /// isZExtFree - Return true if any actual instruction that defines a
+  /// value of type Ty1 implicit zero-extends the value to Ty2 in the result
+  /// register. This does not necessarily include registers defined in
+  /// unknown ways, such as incoming arguments, or copies from unknown
+  /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
+  /// does not necessarily apply to truncate instructions. e.g. on x86-64,
+  /// all instructions that define 32-bit values implicit zero-extend the
+  /// result out to 64 bits.
+  virtual bool isZExtFree(const Type *Ty1, const Type *Ty2) const {
+    return false;
+  }
+
+  virtual bool isZExtFree(MVT VT1, MVT VT2) const {
+    return false;
+  }
+
   //===--------------------------------------------------------------------===//
   // Div utility functions
   //
diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp
index 75cd36dec2..3f8714085a 100644
--- a/lib/CodeGen/LiveInterval.cpp
+++ b/lib/CodeGen/LiveInterval.cpp
@@ -579,24 +579,41 @@ void LiveInterval::MergeInClobberRanges(const LiveInterval &Clobbers,
   
   iterator IP = begin();
   for (const_iterator I = Clobbers.begin(), E = Clobbers.end(); I != E; ++I) {
+    bool Done = false;
     unsigned Start = I->start, End = I->end;
-    IP = std::upper_bound(IP, end(), Start);
-    
-    // If the start of this range overlaps with an existing liverange, trim it.
-    if (IP != begin() && IP[-1].end > Start) {
-      Start = IP[-1].end;
-      // Trimmed away the whole range?
-      if (Start >= End) continue;
-    }
-    // If the end of this range overlaps with an existing liverange, trim it.
-    if (IP != end() && End > IP->start) {
-      End = IP->start;
-      // If this trimmed away the whole range, ignore it.
-      if (Start == End) continue;
+    // If a clobber range starts before an existing range and ends after
+    // it, the clobber range will need to be split into multiple ranges.
+    // Loop until the entire clobber range is handled.
+    while (!Done) {
+      Done = true;
+      IP = std::upper_bound(IP, end(), Start);
+      unsigned SubRangeStart = Start;
+      unsigned SubRangeEnd = End;
+
+      // If the start of this range overlaps with an existing liverange, trim it.
+      if (IP != begin() && IP[-1].end > SubRangeStart) {
+        SubRangeStart = IP[-1].end;
+        // Trimmed away the whole range?
+        if (SubRangeStart >= SubRangeEnd) continue;
+      }
+      // If the end of this range overlaps with an existing liverange, trim it.
+      if (IP != end() && SubRangeEnd > IP->start) {
+        // If the clobber live range extends beyond the existing live range,
+        // it'll need at least another live range, so set the flag to keep
+        // iterating.
+        if (SubRangeEnd > IP->end) {
+          Start = IP->end;
+          Done = false;
+        }
+        SubRangeEnd = IP->start;
+        // If this trimmed away the whole range, ignore it.
+        if (SubRangeStart == SubRangeEnd) continue;
+      }
+
+      // Insert the clobber interval.
+      IP = addRangeFrom(LiveRange(SubRangeStart, SubRangeEnd, ClobberValNo),
+                        IP);
     }
-    
-    // Insert the clobber interval.
-    IP = addRangeFrom(LiveRange(Start, End, ClobberValNo), IP);
   }
 }
 
diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp
index 8c7fa1b1ac..cb08fe759f 100644
--- a/lib/CodeGen/LiveIntervalAnalysis.cpp
+++ b/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -399,6 +399,7 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb,
     unsigned SrcReg, DstReg, SrcSubReg, DstSubReg;
     if (mi->getOpcode() == TargetInstrInfo::EXTRACT_SUBREG ||
         mi->getOpcode() == TargetInstrInfo::INSERT_SUBREG ||
+        mi->getOpcode() == TargetInstrInfo::SUBREG_TO_REG ||
         tii_->isMoveInstr(*mi, SrcReg, DstReg, SrcSubReg, DstSubReg))
       CopyMI = mi;
     // Earlyclobbers move back one.
@@ -556,6 +557,7 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb,
       unsigned SrcReg, DstReg, SrcSubReg, DstSubReg;
       if (mi->getOpcode() == TargetInstrInfo::EXTRACT_SUBREG ||
           mi->getOpcode() == TargetInstrInfo::INSERT_SUBREG ||
+          mi->getOpcode() == TargetInstrInfo::SUBREG_TO_REG ||
           tii_->isMoveInstr(*mi, SrcReg, DstReg, SrcSubReg, DstSubReg))
         CopyMI = mi;
       ValNo = interval.getNextValue(defIndex, CopyMI, VNInfoAllocator);
@@ -658,6 +660,7 @@ void LiveIntervals::handleRegisterDef(MachineBasicBlock *MBB,
     unsigned SrcReg, DstReg, SrcSubReg, DstSubReg;
     if (MI->getOpcode() == TargetInstrInfo::EXTRACT_SUBREG ||
         MI->getOpcode() == TargetInstrInfo::INSERT_SUBREG ||
+        MI->getOpcode() == TargetInstrInfo::SUBREG_TO_REG ||
         tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubReg, DstSubReg))
       CopyMI = MI;
     handlePhysicalRegisterDef(MBB, MI, MIIdx, MO, 
@@ -855,7 +858,8 @@ unsigned LiveIntervals::getVNInfoSourceReg(const VNInfo *VNI) const {
     if (TargetRegisterInfo::isPhysicalRegister(Reg))
       Reg = tri_->getSubReg(Reg, VNI->copy->getOperand(2).getImm());
     return Reg;
-  } else if (VNI->copy->getOpcode() == TargetInstrInfo::INSERT_SUBREG)
+  } else if (VNI->copy->getOpcode() == TargetInstrInfo::INSERT_SUBREG ||
+             VNI->copy->getOpcode() == TargetInstrInfo::SUBREG_TO_REG)
     return VNI->copy->getOperand(2).getReg();
 
   unsigned SrcReg, DstReg, SrcSubReg, DstSubReg;
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 78d5d403e8..e874f1b1ec 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1666,9 +1666,11 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
   // fold (OP (zext x), (zext y)) -> (zext (OP x, y))
   // fold (OP (sext x), (sext y)) -> (sext (OP x, y))
   // fold (OP (aext x), (aext y)) -> (aext (OP x, y))
-  // fold (OP (trunc x), (trunc y)) -> (trunc (OP x, y))
+  // fold (OP (trunc x), (trunc y)) -> (trunc (OP x, y)) (if trunc isn't free)
   if ((N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND||
-       N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::TRUNCATE) &&
+       N0.getOpcode() == ISD::SIGN_EXTEND ||
+       (N0.getOpcode() == ISD::TRUNCATE &&
+        !TLI.isTruncateFree(N0.getOperand(0).getValueType(), VT))) &&
       N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) {
     SDValue ORNode = DAG.getNode(N->getOpcode(), N0.getDebugLoc(),
                                  N0.getOperand(0).getValueType(),
@@ -3121,10 +3123,14 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
     return DAG.getZeroExtendInReg(Op, N->getDebugLoc(), N0.getValueType());
   }
 
-  // fold (zext (and (trunc x), cst)) -> (and x, cst).
+  // Fold (zext (and (trunc x), cst)) -> (and x, cst),
+  // if either of the casts is not free.
   if (N0.getOpcode() == ISD::AND &&
       N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
-      N0.getOperand(1).getOpcode() == ISD::Constant) {
+      N0.getOperand(1).getOpcode() == ISD::Constant &&
+      (!TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
+                           N0.getValueType()) ||
+       !TLI.isZExtFree(N0.getValueType(), VT))) {
     SDValue X = N0.getOperand(0).getOperand(0);
     if (X.getValueType().bitsLT(VT)) {
       X = DAG.getNode(ISD::ANY_EXTEND, X.getDebugLoc(), VT, X);
@@ -3252,10 +3258,13 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
     return DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, TruncOp);
   }
 
-  // fold (aext (and (trunc x), cst)) -> (and x, cst).
+  // Fold (aext (and (trunc x), cst)) -> (and x, cst)
+  // if the trunc is not free.
   if (N0.getOpcode() == ISD::AND &&
       N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
-      N0.getOperand(1).getOpcode() == ISD::Constant) {
+      N0.getOperand(1).getOpcode() == ISD::Constant &&
+      !TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
+                          N0.getValueType())) {
     SDValue X = N0.getOperand(0).getOperand(0);
     if (X.getValueType().bitsLT(VT)) {
       X = DAG.getNode(ISD::ANY_EXTEND, N->getDebugLoc(), VT, X);
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 79a48a6fd9..7b83a12707 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -777,6 +777,48 @@ bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(SDValue Op,
   return false;
 }
 
+/// ShrinkDemandedOp - Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the
+/// casts are free.  This uses isZExtFree and ZERO_EXTEND for the widening
+/// cast, but it could be generalized for targets with other types of
+/// implicit widening casts.
+bool
+TargetLowering::TargetLoweringOpt::ShrinkDemandedOp(SDValue Op,
+                                                    unsigned BitWidth,
+                                                    const APInt &Demanded,
+                                                    DebugLoc dl) {
+  assert(Op.getNumOperands() == 2 &&
+         "ShrinkDemandedOp only supports binary operators!");
+  assert(Op.getNode()->getNumValues() == 1 &&
+         "ShrinkDemandedOp only supports nodes with one result!");
+
+  // Don't do this if the node has another user, which may require the
+  // full value.
+  if (!Op.getNode()->hasOneUse())
+    return false;
+
+  // Search for the smallest integer type with free casts to and from
+  // Op's type. For expedience, just check power-of-2 integer types.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  unsigned SmallVTBits = BitWidth - Demanded.countLeadingZeros();
+  if (!isPowerOf2_32(SmallVTBits))
+    SmallVTBits = NextPowerOf2(SmallVTBits);
+  for (; SmallVTBits < BitWidth; SmallVTBits = NextPowerOf2(SmallVTBits)) {
+    MVT SmallVT = MVT::getIntegerVT(SmallVTBits);
+    if (TLI.isTruncateFree(Op.getValueType(), SmallVT) &&
+        TLI.isZExtFree(SmallVT, Op.getValueType())) {
+      // We found a type with free casts.
+      SDValue X = DAG.getNode(Op.getOpcode(), dl, SmallVT,
+                              DAG.getNode(ISD::TRUNCATE, dl, SmallVT,
+                                          Op.getNode()->getOperand(0)),
+                              DAG.getNode(ISD::TRUNCATE, dl, SmallVT,
+                                          Op.getNode()->getOperand(1)));
+      SDValue Z = DAG.getNode(ISD::ZERO_EXTEND, dl, Op.getValueType(), X);
+      return CombineTo(Op, Z);
+    }
+  }
+  return false;
+}
+
 /// SimplifyDemandedBits - Look at Op.  At this point, we know that only the
 /// DemandedMask bits of the result of Op are ever used downstream.  If we can
 /// use this information to simplify Op, create a new simplified DAG node and
@@ -865,7 +907,10 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
     // If the RHS is a constant, see if we can simplify it.
     if (TLO.ShrinkDemandedConstant(Op, ~KnownZero2 & NewMask))
       return true;
-      
+    // If the operation can be done in a smaller type, do so.
+    if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+      return true;
+
     // Output known-1 bits are only known if set in both the LHS & RHS.
     KnownOne &= KnownOne2;
     // Output known-0 are known to be clear if zero in either the LHS | RHS.
@@ -896,7 +941,10 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
     // If the RHS is a constant, see if we can simplify it.
     if (TLO.ShrinkDemandedConstant(Op, NewMask))
       return true;
-          
+    // If the operation can be done in a smaller type, do so.
+    if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+      return true;
+
     // Output known-0 bits are only known if clear in both the LHS & RHS.
     KnownZero &= KnownZero2;
     // Output known-1 are known to be set if set in either the LHS | RHS.
@@ -918,7 +966,10 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
       return TLO.CombineTo(Op, Op.getOperand(0));
     if ((KnownZero2 & NewMask) == NewMask)
       return TLO.CombineTo(Op, Op.getOperand(1));
-      
+    // If the operation can be done in a smaller type, do so.
+    if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+      return true;
+
     // If all of the unknown bits are known to be zero on one side or the other
     // (but not both) turn this into an *inclusive* or.
     //    e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0
@@ -1333,6 +1384,24 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
     }
 #endif
     break;
+  case ISD::ADD:
+  case ISD::MUL:
+  case ISD::SUB: {
+    // Add, Sub, and Mul don't demand any bits in positions beyond that
+    // of the highest bit demanded of them.
+    APInt LoMask = APInt::getLowBitsSet(BitWidth,
+                                        BitWidth - NewMask.countLeadingZeros());
+    if (SimplifyDemandedBits(Op.getOperand(0), LoMask, KnownZero2,
+                             KnownOne2, TLO, Depth+1))
+      return true;
+    if (SimplifyDemandedBits(Op.getOperand(1), LoMask, KnownZero2,
+                             KnownOne2, TLO, Depth+1))
+      return true;
+    // See if the operation should be performed at a smaller bit width.
+    if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+      return true;
+  }
+  // FALL THROUGH
   default:
     // Just use ComputeMaskedBits to compute output bits.
     TLO.DAG.ComputeMaskedBits(Op, NewMask, KnownZero, KnownOne, Depth);
diff --git a/lib/CodeGen/SimpleRegisterCoalescing.cpp b/lib/CodeGen/SimpleRegisterCoalescing.cpp
index f60b23a6c5..60f7f403a0 100644
--- a/lib/CodeGen/SimpleRegisterCoalescing.cpp
+++ b/lib/CodeGen/SimpleRegisterCoalescing.cpp
@@ -1070,7 +1070,8 @@ SimpleRegisterCoalescing::HasIncompatibleSubRegDefUse(MachineInstr *CopyMI,
           return true;
       }
     }
-    if (MI->getOpcode() == TargetInstrInfo::INSERT_SUBREG) {
+    if (MI->getOpcode() == TargetInstrInfo::INSERT_SUBREG ||
+        MI->getOpcode() == TargetInstrInfo::SUBREG_TO_REG) {
       SubIdx = MI->getOperand(3).getImm();
       if (VirtReg == MI->getOperand(0).getReg()) {
         if (!tri_->getSubReg(PhysReg, SubIdx))
@@ -1164,11 +1165,12 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) {
   unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx;
   bool isExtSubReg = CopyMI->getOpcode() == TargetInstrInfo::EXTRACT_SUBREG;
   bool isInsSubReg = CopyMI->getOpcode() == TargetInstrInfo::INSERT_SUBREG;
+  bool isSubRegToReg = CopyMI->getOpcode() == TargetInstrInfo::SUBREG_TO_REG;
   unsigned SubIdx = 0;
   if (isExtSubReg) {
     DstReg = CopyMI->getOperand(0).getReg();
     SrcReg = CopyMI->getOperand(1).getReg();
-  } else if (isInsSubReg) {
+  } else if (isInsSubReg || isSubRegToReg) {
     if (CopyMI->getOperand(2).getSubReg()) {
       DOUT << "\tSource of insert_subreg is already coalesced "
            << "to another register.\n";
@@ -1212,7 +1214,7 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) {
   MachineBasicBlock *CopyMBB = CopyMI->getParent();
   unsigned RealDstReg = 0;
   unsigned RealSrcReg = 0;
-  if (isExtSubReg || isInsSubReg) {
+  if (isExtSubReg || isInsSubReg || isSubRegToReg) {
     SubIdx = CopyMI->getOperand(isExtSubReg ? 2 : 3).getImm();
     if (SrcIsPhys && isExtSubReg) {
       // r1024 = EXTRACT_SUBREG EAX, 0 then r1024 is really going to be
@@ -1228,7 +1230,7 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) {
       } else
         SrcReg = tri_->getSubReg(SrcReg, SubIdx);
       SubIdx = 0;
-    } else if (DstIsPhys && isInsSubReg) {
+    } else if (DstIsPhys && (isInsSubReg || isSubRegToReg)) {
       // EAX = INSERT_SUBREG EAX, r1024, 0
       unsigned SrcSubIdx = CopyMI->getOperand(2).getSubReg();
       if (SrcSubIdx) {
@@ -1241,8 +1243,9 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) {
       } else
         DstReg = tri_->getSubReg(DstReg, SubIdx);
       SubIdx = 0;
-    } else if ((DstIsPhys && isExtSubReg) || (SrcIsPhys && isInsSubReg)) {
-      if (CopyMI->getOperand(1).getSubReg()) {
+    } else if ((DstIsPhys && isExtSubReg) ||
+               (SrcIsPhys && (isInsSubReg || isSubRegToReg))) {
+      if (!isSubRegToReg && CopyMI->getOperand(1).getSubReg()) {
         DOUT << "\tSrc of extract_subreg already coalesced with reg"
              << " of a super-class.\n";
         return false; // Not coalescable.
@@ -1295,20 +1298,32 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) {
 
     // Process moves where one of the registers have a sub-register index.
     MachineOperand *DstMO = CopyMI->findRegisterDefOperand(DstReg);
-    if (DstMO->getSubReg())
-      // FIXME: Can we handle this?
-      return false;
     MachineOperand *SrcMO = CopyMI->findRegisterUseOperand(SrcReg);
-    SubIdx = SrcMO->getSubReg();
+    SubIdx = DstMO->getSubReg();
     if (SubIdx) {
-      // This is not a extract_subreg but it looks like one.
-      // e.g. %cl = MOV16rr %reg1024:2
-      isExtSubReg = true;
-      if (DstIsPhys) {
-        if (!CanJoinExtractSubRegToPhysReg(DstReg, SrcReg, SubIdx,RealDstReg))
+      if (SrcMO->getSubReg())
+        // FIXME: can we handle this?
+        return false;
+      // This is not an insert_subreg but it looks like one.
+      // e.g. %reg1024:3 = MOV32rr %EAX
+      isInsSubReg = true;
+      if (SrcIsPhys) {
+        if (!CanJoinInsertSubRegToPhysReg(DstReg, SrcReg, SubIdx, RealSrcReg))
           return false; // Not coalescable
         SubIdx = 0;
       }
+    } else {
+      SubIdx = SrcMO->getSubReg();
+      if (SubIdx) {
+        // This is not a extract_subreg but it looks like one.
+        // e.g. %cl = MOV16rr %reg1024:2
+        isExtSubReg = true;
+        if (DstIsPhys) {
+          if (!CanJoinExtractSubRegToPhysReg(DstReg, SrcReg, SubIdx,RealDstReg))
+            return false; // Not coalescable
+          SubIdx = 0;
+        }
+      }
     }
 
     const TargetRegisterClass *SrcRC= SrcIsPhys ? 0 : mri_->getRegClass(SrcReg);
@@ -1393,7 +1408,7 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) {
     SavedLI = li_->dupInterval(&DstInt);
 
   // Check if it is necessary to propagate "isDead" property.
-  if (!isExtSubReg && !isInsSubReg) {
+  if (!isExtSubReg && !isInsSubReg && !isSubRegToReg) {
     MachineOperand *mopd = CopyMI->findRegisterDefOperand(DstReg, false);
     bool isDead = mopd->isDead();
 
@@ -1446,12 +1461,12 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) {
 
     // If definition of source is defined by trivial computation, try
     // rematerializing it.
-    if (!isExtSubReg && !isInsSubReg &&
+    if (!isExtSubReg && !isInsSubReg && !isSubRegToReg &&
         ReMaterializeTrivialDef(SrcInt, DstInt.reg, CopyMI))
       return true;
     
     // If we can eliminate the copy without merging the live ranges, do so now.
-    if (!isExtSubReg && !isInsSubReg &&
+    if (!isExtSubReg && !isInsSubReg && !isSubRegToReg &&
         (AdjustCopiesBackFrom(SrcInt, DstInt, CopyMI) ||
          RemoveCopyByCommutingDef(SrcInt, DstInt, CopyMI))) {
       JoinedCopies.insert(CopyMI);
@@ -1505,8 +1520,10 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) {
 
   // If this is a EXTRACT_SUBREG, make sure the result of coalescing is the
   // larger super-register.
-  if ((isExtSubReg || isInsSubReg) && !SrcIsPhys && !DstIsPhys) {
-    if ((isExtSubReg && !Swapped) || (isInsSubReg && Swapped)) {
+  if ((isExtSubReg || isInsSubReg || isSubRegToReg) &&
+      !SrcIsPhys && !DstIsPhys) {
+    if ((isExtSubReg && !Swapped) ||
+        ((isInsSubReg || isSubRegToReg) && Swapped)) {
       ResSrcInt->Copy(*ResDstInt, li_->getVNInfoAllocator());
       std::swap(SrcReg, DstReg);
       std::swap(ResSrcInt, ResDstInt);
@@ -1594,7 +1611,7 @@ bool SimpleRegisterCoalescing::JoinCopy(CopyRec &TheCopy, bool &Again) {
 
   // If resulting interval has a preference that no longer fits because of subreg
   // coalescing, just clear the preference.
-  if (ResDstInt->preference && (isExtSubReg || isInsSubReg) &&
+  if (ResDstInt->preference && (isExtSubReg || isInsSubReg || isSubRegToReg) &&
       TargetRegisterInfo::isVirtualRegister(ResDstInt->reg)) {
     const TargetRegisterClass *RC = mri_->getRegClass(ResDstInt->reg);
     if (!RC->contains(ResDstInt->preference))
@@ -1847,7 +1864,13 @@ bool SimpleRegisterCoalescing::SimpleJoin(LiveInterval &LHS, LiveInterval &RHS){
   LHS.weight += RHS.weight;
   if (RHS.preference && !LHS.preference)
     LHS.preference = RHS.preference;
-  
+
+  // Update the liveintervals of sub-registers.
+  if (TargetRegisterInfo::isPhysicalRegister(LHS.reg))
+    for (const unsigned *AS = tri_->getSubRegisters(LHS.reg); *AS; ++AS)
+      li_->getOrCreateInterval(*AS).MergeInClobberRanges(LHS,
+                                                    li_->getVNInfoAllocator());
+
   return true;
 }
 
@@ -2183,7 +2206,8 @@ void SimpleRegisterCoalescing::CopyCoalesceInMBB(MachineBasicBlock *MBB,
     if (Inst->getOpcode() == TargetInstrInfo::EXTRACT_SUBREG) {
       DstReg = Inst->getOperand(0).getReg();
       SrcReg = Inst->getOperand(1).getReg();
-    } else if (Inst->getOpcode() == TargetInstrInfo::INSERT_SUBREG) {
+    } else if (Inst->getOpcode() == TargetInstrInfo::INSERT_SUBREG ||
+               Inst->getOpcode() == TargetInstrInfo::SUBREG_TO_REG) {
       DstReg = Inst->getOperand(0).getReg();
       SrcReg = Inst->getOperand(2).getReg();
     } else if (!tii_->isMoveInstr(*Inst, SrcReg, DstReg, SrcSubIdx, DstSubIdx))
@@ -2498,7 +2522,8 @@ bool SimpleRegisterCoalescing::runOnMachineFunction(MachineFunction &fn) {
         // Delete all coalesced copies.
         if (!tii_->isMoveInstr(*MI, SrcReg, DstReg, SrcSubIdx, DstSubIdx)) {
           assert((MI->getOpcode() == TargetInstrInfo::EXTRACT_SUBREG ||
-                  MI->getOpcode() == TargetInstrInfo::INSERT_SUBREG) &&
+                  MI->getOpcode() == TargetInstrInfo::INSERT_SUBREG ||
+                  MI->getOpcode() == TargetInstrInfo::SUBREG_TO_REG) &&
                  "Unrecognized copy instruction");
           DstReg = MI->getOperand(0).getReg();
         }
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index e8ae988f0c..8aa866ea29 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -177,7 +177,7 @@ bool TwoAddressInstructionPass::Sink3AddrInstruction(MachineBasicBlock *MBB,
     break;
   }
 
-  if (!KillMI || KillMI->getParent() != MBB)
+  if (!KillMI || KillMI->getParent() != MBB || KillMI == MI)
     return false;
 
   // If any of the definitions are used by another instruction between the
@@ -326,6 +326,9 @@ static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII,
     } else if (MI.getOpcode() == TargetInstrInfo::INSERT_SUBREG) {
       DstReg = MI.getOperand(0).getReg();
       SrcReg = MI.getOperand(2).getReg();
+    } else if (MI.getOpcode() == TargetInstrInfo::SUBREG_TO_REG) {
+      DstReg = MI.getOperand(0).getReg();
+      SrcReg = MI.getOperand(2).getReg();
     }
   }
 
@@ -337,6 +340,46 @@ static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII,
   return false;
 }
 
+/// isKilled - Test if the given register value, which is used by the given
+/// instruction, is killed by the given instruction. This looks through
+/// coalescable copies to see if the original value is potentially not killed.
+///
+/// For example, in this code:
+///
+///   %reg1034 = copy %reg1024
+///   %reg1035 = copy %reg1025<kill>
+///   %reg1036 = add %reg1034<kill>, %reg1035<kill>
+///
+/// %reg1034 is not considered to be killed, since it is copied from a
+/// register which is not killed. Treating it as not killed lets the
+/// normal heuristics commute the (two-address) add, which lets
+/// coalescing eliminate the extra copy.
+///
+static bool isKilled(MachineInstr &MI, unsigned Reg,
+                     const MachineRegisterInfo *MRI,
+                     const TargetInstrInfo *TII) {
+  MachineInstr *DefMI = &MI;
+  for (;;) {
+    if (!DefMI->killsRegister(Reg))
+      return false;
+    if (TargetRegisterInfo::isPhysicalRegister(Reg))
+      return true;
+    MachineRegisterInfo::def_iterator Begin = MRI->def_begin(Reg);
+    // If there are multiple defs, we can't do a simple analysis, so just
+    // go with what the kill flag says.
+    if (next(Begin) != MRI->def_end())
+      return true;
+    DefMI = &*Begin;
+    bool IsSrcPhys, IsDstPhys;
+    unsigned SrcReg,  DstReg;
+    // If the def is something other than a copy, then it isn't going to
+    // be coalesced, so follow the kill flag.
+    if (!isCopyToReg(*DefMI, TII, SrcReg, DstReg, IsSrcPhys, IsDstPhys))
+      return true;
+    Reg = SrcReg;
+  }
+}
+
 /// isTwoAddrUse - Return true if the specified MI uses the specified register
 /// as a two-address use. If so, return the destination register by reference.
 static bool isTwoAddrUse(MachineInstr &MI, unsigned Reg, unsigned &DstReg) {
@@ -735,7 +778,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) {
           // rearrange the code to make it so.  Making it the killing user will
           // allow us to coalesce A and B together, eliminating the copy we are
           // about to insert.
-          if (!mi->killsRegister(regB)) {
+          if (!isKilled(*mi, regB, MRI, TII)) {
             // If regA is dead and the instruction can be deleted, just delete
             // it so it doesn't clobber regB.
             if (mi->getOperand(ti).isDead() && isSafeToDelete(mi, TII)) {
@@ -753,7 +796,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) {
               assert(mi->getOperand(3-si).isReg() &&
                      "Not a proper commutative instruction!");
               unsigned regC = mi->getOperand(3-si).getReg();
-              if (mi->killsRegister(regC)) {
+              if (isKilled(*mi, regC, MRI, TII)) {
                 if (CommuteInstruction(mi, mbbi, regB, regC, Dist)) {
                   ++NumCommuted;
                   regB = regC;
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index c5a6acbf7a..6bdb92fbcb 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -7258,6 +7258,16 @@ bool X86TargetLowering::isTruncateFree(MVT VT1, MVT VT2) const {
   return Subtarget->is64Bit() || NumBits1 < 64;
 }
 
+bool X86TargetLowering::isZExtFree(const Type *Ty1, const Type *Ty2) const {
+  // x86-64 has implicitly zero-extends 32-bit results in 64-bit registers.
+  return Ty1 == Type::Int32Ty && Ty2 == Type::Int64Ty && Subtarget->is64Bit();
+}
+
+bool X86TargetLowering::isZExtFree(MVT VT1, MVT VT2) const {
+  // x86-64 has implicitly zero-extends 32-bit results in 64-bit registers.
+  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget->is64Bit();
+}
+
 /// isShuffleMaskLegal - Targets can use this to indicate that they only
 /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index ca4af63422..45b3e974ae 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -458,7 +458,18 @@ namespace llvm {
     /// register EAX to i16 by referencing its sub-register AX.
     virtual bool isTruncateFree(const Type *Ty1, const Type *Ty2) const;
     virtual bool isTruncateFree(MVT VT1, MVT VT2) const;
-  
+
+    /// isZExtFree - Return true if any actual instruction that defines a
+    /// value of type Ty1 implicit zero-extends the value to Ty2 in the result
+    /// register. This does not necessarily include registers defined in
+    /// unknown ways, such as incoming arguments, or copies from unknown
+    /// virtual registers. Also, if isTruncateFree(Ty2, Ty1) is true, this
+    /// does not necessarily apply to truncate instructions. e.g. on x86-64,
+    /// all instructions that define 32-bit values implicit zero-extend the
+    /// result out to 64 bits.
+    virtual bool isZExtFree(const Type *Ty1, const Type *Ty2) const;
+    virtual bool isZExtFree(MVT VT1, MVT VT2) const;
+
     /// isShuffleMaskLegal - Targets can use this to indicate that they only
     /// support *some* VECTOR_SHUFFLE operations, those with specific masks.
     /// By default, if a target supports the VECTOR_SHUFFLE node, all mask
diff --git a/lib/Target/X86/X86Instr64bit.td b/lib/Target/X86/X86Instr64bit.td
index ce5a8a3703..f88834e4a2 100644
--- a/lib/Target/X86/X86Instr64bit.td
+++ b/lib/Target/X86/X86Instr64bit.td
@@ -292,10 +292,12 @@ def MOVZX64rm16: I<0xB7, MRMSrcMem, (outs GR64:$dst), (ins i16mem:$src),
                    [(set GR64:$dst, (zextloadi64i16 addr:$src))]>, TB;
 
 // There's no movzlq instruction, but movl can be used for this purpose, using
-// implicit zero-extension. We need this because the seeming alternative for
-// implementing zext from 32 to 64, an EXTRACT_SUBREG/SUBREG_TO_REG pair, isn't
-// safe because both instructions could be optimized away in the
-// register-to-register case, leaving nothing behind to do the zero extension.
+// implicit zero-extension. The preferred way to do 32-bit-to-64-bit zero
+// extension on x86-64 is to use a SUBREG_TO_REG to utilize implicit
+// zero-extension, however this isn't possible when the 32-bit value is
+// defined by a truncate or is copied from something where the high bits aren't
+// necessarily all zero. In such cases, we fall back to these explicit zext
+// instructions.
 def MOVZX64rr32 : I<0x89, MRMDestReg, (outs GR64:$dst), (ins GR32:$src),
                     "mov{l}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
                     [(set GR64:$dst, (zext GR32:$src))]>;
@@ -303,6 +305,21 @@ def MOVZX64rm32 : I<0x8B, MRMSrcMem, (outs GR64:$dst), (ins i32mem:$src),
                     "mov{l}\t{$src, ${dst:subreg32}|${dst:subreg32}, $src}",
                     [(set GR64:$dst, (zextloadi64i32 addr:$src))]>;
 
+// Any instruction that defines a 32-bit result leaves the high half of the
+// register. Truncate can be lowered to EXTRACT_SUBREG, and CopyFromReg may
+// be copying from a truncate, but any other 32-bit operation will zero-extend
+// up to 64 bits.
+def def32 : PatLeaf<(i32 GR32:$src), [{
+  return N->getOpcode() != ISD::TRUNCATE &&
+         N->getOpcode() != TargetInstrInfo::EXTRACT_SUBREG &&
+         N->getOpcode() != ISD::CopyFromReg;
+}]>;
+
+// In the case of a 32-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG.
+def : Pat<(i64 (zext def32:$src)),
+          (SUBREG_TO_REG (i64 0), GR32:$src, x86_subreg_32bit)>;
+
 let neverHasSideEffects = 1 in {
   let Defs = [RAX], Uses = [EAX] in
   def CDQE : RI<0x98, RawFrm, (outs), (ins),
@@ -1443,10 +1460,6 @@ def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_O, EFLAGS),
 def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, X86_COND_NO, EFLAGS),
           (CMOVO64rm GR64:$src2, addr:$src1)>;
 
-// Zero-extension
-def : Pat<(i64 (zext GR32:$src)), 
-          (SUBREG_TO_REG (i64 0), GR32:$src, x86_subreg_32bit)>;
-
 // zextload bool -> zextload byte
 def : Pat<(zextloadi64i1 addr:$src), (MOVZX64rm8 addr:$src)>;
 
diff --git a/test/CodeGen/X86/2008-05-06-SpillerBug.ll b/test/CodeGen/X86/2008-05-06-SpillerBug.ll
deleted file mode 100644
index e13f398062..0000000000
--- a/test/CodeGen/X86/2008-05-06-SpillerBug.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; RUN: llvm-as < %s | llc -mtriple=i386-apple-darwin -relocation-model=pic -disable-fp-elim | grep addb | grep ebp
-
-	%struct.rc4_state = type { i32, i32, [256 x i32] }
-@.str1 = internal constant [65 x i8] c"m[%d] = 0x%02x, m[%d] = 0x%02x, 0x%02x, k = %d, key[k] = 0x%02x\0A\00"		; <[65 x i8]*> [#uses=1]
-@keys = internal constant [7 x [30 x i8]] [ [30 x i8] c"\08\01#Eg\89\AB\CD\EF\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00", [30 x i8] c"\08\01#Eg\89\AB\CD\EF\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00", [30 x i8] c"\08\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00", [30 x i8] c"\04\EF\01#E\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00", [30 x i8] c"\08\01#Eg\89\AB\CD\EF\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00", [30 x i8] c"\04\EF\01#E\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00", [30 x i8] zeroinitializer ]		; <[7 x [30 x i8]]*> [#uses=1]
-
-declare i32 @printf(i8*, ...) nounwind 
-
-define i32 @main(i32 %argc, i8** %argv) nounwind  {
-entry:
-	br label %bb25
-
-bb25:		; preds = %bb25, %entry
-  %foo = phi i1 [ 0, %bb25], [ 1, %entry]
-	br i1 %foo, label %bb.i, label %bb25
-
-bb.i:		; preds = %bb.i, %bb25
-  %foo2 = phi i1 [ 0, %bb.i], [1, %bb25]
-	br i1 %foo2, label %bb21.i, label %bb.i
-
-bb21.i:		; preds = %bb21.i, %bb.i
-	%k.0.reg2mem.0.i = phi i32 [ %k.1.i, %bb21.i ], [ 0, %bb.i ]		; <i32> [#uses=2]
-	%j.0.reg2mem.0.i = phi i8 [ %tmp35.i, %bb21.i ], [ 0, %bb.i ]		; <i8> [#uses=1]
-	%tmp25.i = load i32* null, align 4		; <i32> [#uses=4]
-	%tmp2829.i = trunc i32 %tmp25.i to i8		; <i8> [#uses=1]
-	%.sum = add i32 %k.0.reg2mem.0.i, 1		; <i32> [#uses=3]
-	%tmp33.i = getelementptr [7 x [30 x i8]]* @keys, i32 0, i32 0, i32 %.sum		; <i8*> [#uses=1]
-	%tmp34.i = load i8* %tmp33.i, align 1		; <i8> [#uses=1]
-	%tmp30.i = add i8 %tmp2829.i, %j.0.reg2mem.0.i		; <i8> [#uses=1]
-	%tmp35.i = add i8 %tmp30.i, %tmp34.i		; <i8> [#uses=2]
-	%tmp3536.i = zext i8 %tmp35.i to i32		; <i32> [#uses=2]
-	%tmp39.i = getelementptr %struct.rc4_state* null, i32 0, i32 2, i32 %tmp3536.i		; <i32*> [#uses=1]
-	store i32 %tmp25.i, i32* %tmp39.i, align 4
-	%tmp60.i = load i32* null, align 4		; <i32> [#uses=1]
-	%tmp65.i = call i32 (i8*, ...)* @printf( i8* getelementptr ([65 x i8]* @.str1, i32 0, i32 0), i32 0, i32 %tmp60.i, i32 %tmp3536.i, i32 %tmp25.i, i32 %tmp25.i, i32 %k.0.reg2mem.0.i, i32 0 ) nounwind 		; <i32> [#uses=0]
-	%tmp70.i = icmp slt i32 %.sum, 8		; <i1> [#uses=1]
-	%k.1.i = select i1 %tmp70.i, i32 %.sum, i32 0		; <i32> [#uses=1]
-	br label %bb21.i
-}
diff --git a/test/CodeGen/X86/byval7.ll b/test/CodeGen/X86/byval7.ll
index fcbc59b838..6b64c6ce4d 100644
--- a/test/CodeGen/X86/byval7.ll
+++ b/test/CodeGen/X86/byval7.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah | grep add | grep 16
+; RUN: llvm-as < %s | llc -march=x86 -mcpu=yonah | egrep {add|lea} | grep 16
 
 	%struct.S = type { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>,
                            <2 x i64> }
diff --git a/test/CodeGen/X86/iv-users-in-other-loops.ll b/test/CodeGen/X86/iv-users-in-other-loops.ll
index b47b856052..b21586173f 100644
--- a/test/CodeGen/X86/iv-users-in-other-loops.ll
+++ b/test/CodeGen/X86/iv-users-in-other-loops.ll
@@ -1,7 +1,7 @@
 ; RUN: llvm-as < %s | llc -march=x86-64 -f -o %t
 ; RUN: grep inc %t | count 2
 ; RUN: grep addq %t | count 13
-; RUN: grep leaq %t | count 10
+; RUN: grep leaq %t | count 9
 ; RUN: grep movq %t | count 5
 
 ; IV users in each of the loops from other loops shouldn't cause LSR
diff --git a/test/CodeGen/X86/subreg-to-reg-1.ll b/test/CodeGen/X86/subreg-to-reg-1.ll
index 4e487e1847..cf9f2d8142 100644
--- a/test/CodeGen/X86/subreg-to-reg-1.ll
+++ b/test/CodeGen/X86/subreg-to-reg-1.ll
@@ -1,6 +1,9 @@
-; RUN: llvm-as < %s | llc -march=x86-64 | grep {movl	%e.\*, %e.\*} | count 1
+; RUN: llvm-as < %s | llc -march=x86-64 | grep {leal	.*), %e.\*} | count 1
 
 ; Don't eliminate or coalesce away the explicit zero-extension!
+; This is currently using an leal because of a 3-addressification detail,
+; though this isn't necessary; The point of this test is to make sure
+; a 32-bit add is used.
 
 define i64 @foo(i64 %a) {
   %b = add i64 %a, 4294967295
diff --git a/test/CodeGen/X86/subreg-to-reg-3.ll b/test/CodeGen/X86/subreg-to-reg-3.ll
new file mode 100644
index 0000000000..6634538c2a
--- /dev/null
+++ b/test/CodeGen/X86/subreg-to-reg-3.ll
@@ -0,0 +1,10 @@
+; RUN: llvm-as < %s | llc -march=x86-64 | grep imull
+
+; Don't eliminate or coalesce away the explicit zero-extension!
+
+define i64 @foo(i64 %a) {
+  %b = mul i64 %a, 7823
+  %c = and i64 %b, 4294967295
+  %d = add i64 %c, 1
+  ret i64 %d
+}
diff --git a/test/CodeGen/X86/subreg-to-reg-4.ll b/test/CodeGen/X86/subreg-to-reg-4.ll
new file mode 100644
index 0000000000..bb6af3988c
--- /dev/null
+++ b/test/CodeGen/X86/subreg-to-reg-4.ll
@@ -0,0 +1,135 @@
+; RUN: llvm-as < %s | llc -march=x86-64 > %t
+; RUN: not grep leaq %t
+; RUN: not grep incq %t
+; RUN: not grep decq %t
+; RUN: not grep negq %t
+; RUN: not grep addq %t
+; RUN: not grep subq %t
+; RUN: not grep {movl	%} %t
+
+; Utilize implicit zero-extension on x86-64 to eliminate explicit
+; zero-extensions. Shrink 64-bit adds to 32-bit when the high
+; 32-bits will be zeroed.
+
+define void @bar(i64 %x, i64 %y, i64* %z) nounwind readnone {
+entry:
+	%t0 = add i64 %x, %y
+	%t1 = and i64 %t0, 4294967295
+        store i64 %t1, i64* %z
+	ret void
+}
+define void @easy(i32 %x, i32 %y, i64* %z) nounwind readnone {
+entry:
+	%t0 = add i32 %x, %y
+        %tn = zext i32 %t0 to i64
+	%t1 = and i64 %tn, 4294967295
+        store i64 %t1, i64* %z
+	ret void
+}
+define void @cola(i64 *%x, i64 %y, i64* %z, i64 %u) nounwind readnone {
+entry:
+        %p = load i64* %x
+	%t0 = add i64 %p, %y
+	%t1 = and i64 %t0, 4294967295
+        %t2 = xor i64 %t1, %u
+        store i64 %t2, i64* %z
+	ret void
+}
+define void @yaks(i64 *%x, i64 %y, i64* %z, i64 %u) nounwind readnone {
+entry:
+        %p = load i64* %x
+	%t0 = add i64 %p, %y
+        %t1 = xor i64 %t0, %u
+	%t2 = and i64 %t1, 4294967295
+        store i64 %t2, i64* %z
+	ret void
+}
+define void @foo(i64 *%x, i64 *%y, i64* %z) nounwind readnone {
+entry:
+        %a = load i64* %x
+        %b = load i64* %y
+	%t0 = add i64 %a, %b
+	%t1 = and i64 %t0, 4294967295
+        store i64 %t1, i64* %z
+	ret void
+}
+define void @avo(i64 %x, i64* %z, i64 %u) nounwind readnone {
+entry:
+	%t0 = add i64 %x, 734847
+	%t1 = and i64 %t0, 4294967295
+        %t2 = xor i64 %t1, %u
+        store i64 %t2, i64* %z
+	ret void
+}
+define void @phe(i64 %x, i64* %z, i64 %u) nounwind readnone {
+entry:
+	%t0 = add i64 %x, 734847
+        %t1 = xor i64 %t0, %u
+	%t2 = and i64 %t1, 4294967295
+        store i64 %t2, i64* %z
+	ret void
+}
+define void @oze(i64 %y, i64* %z) nounwind readnone {
+entry:
+	%t0 = add i64 %y, 1
+	%t1 = and i64 %t0, 4294967295
+        store i64 %t1, i64* %z
+	ret void
+}
+
+define void @sbar(i64 %x, i64 %y, i64* %z) nounwind readnone {
+entry:
+	%t0 = sub i64 %x, %y
+	%t1 = and i64 %t0, 4294967295
+        store i64 %t1, i64* %z
+	ret void
+}
+define void @seasy(i32 %x, i32 %y, i64* %z) nounwind readnone {
+entry:
+	%t0 = sub i32 %x, %y
+        %tn = zext i32 %t0 to i64
+	%t1 = and i64 %tn, 4294967295
+        store i64 %t1, i64* %z
+	ret void
+}
+define void @scola(i64 *%x, i64 %y, i64* %z, i64 %u) nounwind readnone {
+entry:
+        %p = load i64* %x
+	%t0 = sub i64 %p, %y
+	%t1 = and i64 %t0, 4294967295
+        %t2 = xor i64 %t1, %u
+        store i64 %t2, i64* %z
+	ret void
+}
+define void @syaks(i64 *%x, i64 %y, i64* %z, i64 %u) nounwind readnone {
+entry:
+        %p = load i64* %x
+	%t0 = sub i64 %p, %y
+        %t1 = xor i64 %t0, %u
+	%t2 = and i64 %t1, 4294967295
+        store i64 %t2, i64* %z
+	ret void
+}
+define void @sfoo(i64 *%x, i64 *%y, i64* %z) nounwind readnone {
+entry:
+        %a = load i64* %x
+        %b = load i64* %y
+	%t0 = sub i64 %a, %b
+	%t1 = and i64 %t0, 4294967295
+        store i64 %t1, i64* %z
+	ret void
+}
+define void @swya(i64 %y, i64* %z) nounwind readnone {
+entry:
+	%t0 = sub i64 0, %y
+	%t1 = and i64 %t0, 4294967295
+        store i64 %t1, i64* %z
+	ret void
+}
+define void @soze(i64 %y, i64* %z) nounwind readnone {
+entry:
+	%t0 = sub i64 %y, 1
+	%t1 = and i64 %t0, 4294967295
+        store i64 %t1, i64* %z
+	ret void
+}
diff --git a/test/CodeGen/X86/subreg-to-reg-5.ll b/test/CodeGen/X86/subreg-to-reg-5.ll
new file mode 100644
index 0000000000..eee751a87d
--- /dev/null
+++ b/test/CodeGen/X86/subreg-to-reg-5.ll
@@ -0,0 +1,34 @@
+; RUN: llvm-as < %s | llc -march=x86-64 > %t
+; RUN: grep addl %t
+; RUN: not egrep {movl|movq} %t
+
+define float @foo(float* %B) nounwind {
+entry:
+	br label %bb2
+
+bb2:		; preds = %bb3, %entry
+	%B_addr.0.rec = phi i64 [ %indvar.next154, %bb3 ], [ 0, %entry ]		; <i64> [#uses=2]
+	br i1 false, label %bb3, label %bb4
+
+bb3:		; preds = %bb2
+	%indvar.next154 = add i64 %B_addr.0.rec, 1		; <i64> [#uses=1]
+	br label %bb2
+
+bb4:		; preds = %bb2
+	%B_addr.0 = getelementptr float* %B, i64 %B_addr.0.rec		; <float*> [#uses=1]
+	%t1 = ptrtoint float* %B_addr.0 to i64		; <i64> [#uses=1]
+	%t2 = and i64 %t1, 15		; <i64> [#uses=1]
+	%t3 = icmp eq i64 %t2, 0		; <i1> [#uses=1]
+	br i1 %t3, label %bb5, label %bb10.preheader
+
+bb10.preheader:		; preds = %bb4
+	br label %bb9
+
+bb5:		; preds = %bb4
+	unreachable
+
+bb9:		; preds = %bb10.preheader
+	%t5 = getelementptr float* %B, i64 0		; <float*> [#uses=1]
+	%t7 = load float* %t5		; <float> [#uses=1]
+	ret float %t7
+}
diff --git a/test/CodeGen/X86/subreg-to-reg-6.ll b/test/CodeGen/X86/subreg-to-reg-6.ll
new file mode 100644
index 0000000000..f18eef7d19
--- /dev/null
+++ b/test/CodeGen/X86/subreg-to-reg-6.ll
@@ -0,0 +1,29 @@
+; RUN: llvm-as < %s | llc -march=x86-64
+
+define i64 @foo() nounwind {
+entry:
+	%t0 = load i32* null, align 8
+	switch i32 %t0, label %bb65 [
+		i32 16, label %bb
+		i32 12, label %bb56
+	]
+
+bb:
+	br label %bb65
+
+bb56:
+	unreachable
+
+bb65:
+	%a = phi i64 [ 0, %bb ], [ 0, %entry ]
+	tail call void asm "", "{cx}"(i64 %a) nounwind
+	%t15 = and i64 %a, 4294967295
+	ret i64 %t15
+}
+
+define i64 @bar(i64 %t0) nounwind {
+	call void asm "", "{cx}"(i64 0) nounwind
+	%t1 = sub i64 0, %t0
+	%t2 = and i64 %t1, 4294967295
+	ret i64 %t2
+}
diff --git a/test/CodeGen/X86/twoaddr-coalesce.ll b/test/CodeGen/X86/twoaddr-coalesce.ll
index ba98ae8252..5293b77879 100644
--- a/test/CodeGen/X86/twoaddr-coalesce.ll
+++ b/test/CodeGen/X86/twoaddr-coalesce.ll
@@ -1,5 +1,4 @@
-; RUN: llvm-as < %s | llc -march=x86 -stats |& \
-; RUN:   grep {twoaddrinstr} | grep {Number of instructions aggressively commuted}
+; RUN: llvm-as < %s | llc -march=x86 | grep mov | count 5
 ; rdar://6523745
 
 @"\01LC" = internal constant [4 x i8] c"%d\0A\00"		; <[4 x i8]*> [#uses=1]
author	Dan Gohman <gohman@apple.com>	2009-04-08 00:15:30 +0000
committer	Dan Gohman <gohman@apple.com>	2009-04-08 00:15:30 +0000
commit	97121ba2afb8d566ff1bf5c4e8fc5d4077940a7f (patch)
tree	b530d1fd94181f009f5d7ff1d760c88336a67def
parent	a49a671efe24aa6e2e9e2280cf714ef97a40f177 (diff)
download	llvm-97121ba2afb8d566ff1bf5c4e8fc5d4077940a7f.tar.gz llvm-97121ba2afb8d566ff1bf5c4e8fc5d4077940a7f.tar.bz2 llvm-97121ba2afb8d566ff1bf5c4e8fc5d4077940a7f.tar.xz