2 files changed, 124 insertions, 2 deletions
diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp
index 726af46965..d1e3f1afbf 100644
--- a/lib/CodeGen/InlineSpiller.cpp
+++ b/lib/CodeGen/InlineSpiller.cpp
@@ -1017,14 +1017,18 @@ bool InlineSpiller::foldMemoryOperand(MachineBasicBlock::iterator MI,
                                       const SmallVectorImpl<unsigned> &Ops,
                                       MachineInstr *LoadMI) {
   bool WasCopy = MI->isCopy();
+  unsigned ImpReg = 0;
+
   // TargetInstrInfo::foldMemoryOperand only expects explicit, non-tied
   // operands.
   SmallVector<unsigned, 8> FoldOps;
   for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
     unsigned Idx = Ops[i];
     MachineOperand &MO = MI->getOperand(Idx);
-    if (MO.isImplicit())
+    if (MO.isImplicit()) {
+      ImpReg = MO.getReg();
       continue;
+    }
     // FIXME: Teach targets to deal with subregs.
     if (MO.getSubReg())
       return false;
@@ -1045,7 +1049,20 @@ bool InlineSpiller::foldMemoryOperand(MachineBasicBlock::iterator MI,
   if (!LoadMI)
     VRM.addSpillSlotUse(StackSlot, FoldMI);
   MI->eraseFromParent();
-  DEBUG(dbgs() << "\tfolded: " << *FoldMI);
+
+  // TII.foldMemoryOperand may have left some implicit operands on the
+  // instruction.  Strip them.
+  if (ImpReg)
+    for (unsigned i = FoldMI->getNumOperands(); i; --i) {
+      MachineOperand &MO = FoldMI->getOperand(i - 1);
+      if (!MO.isReg() || !MO.isImplicit())
+        break;
+      if (MO.getReg() == ImpReg)
+        FoldMI->RemoveOperand(i - 1);
+    }
+
+  DEBUG(dbgs() << "\tfolded:  " << LIS.getInstructionIndex(FoldMI) << '\t'
+               << *FoldMI);
   if (!WasCopy)
     ++NumFolded;
   else if (Ops.front() == 0)
diff --git a/test/CodeGen/X86/2011-11-09-FoldImpDefs.ll b/test/CodeGen/X86/2011-11-09-FoldImpDefs.ll
new file mode 100644
index 0000000000..095d8c68c2
--- /dev/null
+++ b/test/CodeGen/X86/2011-11-09-FoldImpDefs.ll
@@ -0,0 +1,105 @@
+; RUN: llc < %s -verify-regalloc | FileCheck %s
+; PR11347
+;
+; This test case materializes the constant 1 in a register:
+;
+; %vreg19<def> = MOV32ri 1
+;
+; It then rematerializes the instruction for a sub-register copy:
+; 1168L   %vreg14:sub_8bit<def,undef> = COPY %vreg19:sub_8bit<kill>, %vreg14<imp-def>; GR32:%vreg14,%vreg19
+;        Considering merging %vreg19 with %vreg14
+;                RHS = %vreg19 = [560d,656L:0)[720L,976d:0)[1088L,1168d:0)  0@560d
+;                LHS = %vreg14 = [16d,160L:0)[160L,256L:2)[256L,1088L:1)[1168d,1184L:3)[1184L,1344L:2)  0@16d-phikill 1@256L-phidef-phikill 2@1184L-phidef-phikill 3@1168d-phikill
+; Remat: %vreg14<def> = MOV32ri 1, %vreg14<imp-def>, %vreg14<imp-def>; GR32:%vreg14
+;
+; This rematerialized constant is feeding a PHI that is spilled, so the constant
+; is written directly to a stack slot that gets the %esi function argument in
+; another basic block:
+;
+; CHECK: %entry
+; CHECK: movl %esi, [[FI:[0-9]+\(%rsp\)]]
+; CHECK: %if.else24
+; CHECK: movl $1, [[FI]]
+; CHECK: %lor.end9
+; CHECK: movl [[FI]],
+;
+; Those <imp-def> operands on the MOV32ri instruction confused the spiller
+; because they were preserved by TII.foldMemoryOperand.  It is quite rare to
+; see a rematerialized instruction spill; it can only happen when it is feeding
+; a PHI.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.7"
+
+@g_193 = external global i32, align 4
+@g_103 = external global i32, align 4
+
+declare i32 @func_21(i16 signext, i32) nounwind uwtable readnone ssp
+
+define i32 @func_25(i32 %p_27, i8 signext %p_28, i32 %p_30) noreturn nounwind uwtable ssp {
+entry:
+  br label %for.cond
+
+for.cond28.for.cond.loopexit_crit_edge:           ; preds = %for.cond28thread-pre-split
+  store i32 0, i32* @g_103, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.cond28thread-pre-split, %for.cond28.for.cond.loopexit_crit_edge, %entry
+  %l_365.0 = phi i32 [ undef, %entry ], [ %and, %for.cond28.for.cond.loopexit_crit_edge ], [ %and, %for.cond28thread-pre-split ]
+  %l_288.0 = phi i32 [ undef, %entry ], [ %l_288.1.ph, %for.cond28.for.cond.loopexit_crit_edge ], [ %l_288.1.ph, %for.cond28thread-pre-split ]
+  %l_349.0 = phi i32 [ undef, %entry ], [ %xor, %for.cond28.for.cond.loopexit_crit_edge ], [ %xor, %for.cond28thread-pre-split ]
+  %p_28.addr.0 = phi i8 [ %p_28, %entry ], [ %p_28.addr.1.ph, %for.cond28.for.cond.loopexit_crit_edge ], [ %p_28.addr.1.ph, %for.cond28thread-pre-split ]
+  br i1 undef, label %for.cond31, label %lor.end
+
+lor.end:                                          ; preds = %for.cond
+  %tobool3 = icmp eq i32 %l_349.0, 0
+  br i1 %tobool3, label %for.cond31, label %if.then
+
+if.then:                                          ; preds = %lor.end
+  br i1 undef, label %lor.rhs6, label %lor.end9
+
+lor.rhs6:                                         ; preds = %if.then
+  br label %lor.end9
+
+lor.end9:                                         ; preds = %lor.rhs6, %if.then
+  %and = and i32 %l_365.0, 1
+  %conv11 = sext i8 %p_28.addr.0 to i32
+  %xor = xor i32 %and, %conv11
+  br i1 false, label %if.else, label %if.end
+
+if.else:                                          ; preds = %lor.end9
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %lor.end9
+  %l_395.0 = phi i32 [ 0, %if.else ], [ 1, %lor.end9 ]
+  %cmp14 = icmp ne i32 %and, %conv11
+  %conv15 = zext i1 %cmp14 to i32
+  br i1 %cmp14, label %if.then16, label %for.cond28thread-pre-split
+
+if.then16:                                        ; preds = %if.end
+  %or17 = or i32 %l_288.0, 1
+  %call18 = tail call i32 @func_39(i32 0, i32 %or17, i32 0, i32 0) nounwind
+  br i1 undef, label %if.else24, label %if.then20
+
+if.then20:                                        ; preds = %if.then16
+  %conv21 = trunc i32 %l_395.0 to i16
+  %call22 = tail call i32 @func_21(i16 signext %conv21, i32 undef)
+  br label %for.cond28thread-pre-split
+
+if.else24:                                        ; preds = %if.then16
+  store i32 %conv15, i32* @g_193, align 4
+  %conv25 = trunc i32 %l_395.0 to i8
+  br label %for.cond28thread-pre-split
+
+for.cond28thread-pre-split:                       ; preds = %if.else24, %if.then20, %if.end
+  %l_288.1.ph = phi i32 [ %l_288.0, %if.end ], [ %or17, %if.else24 ], [ %or17, %if.then20 ]
+  %p_28.addr.1.ph = phi i8 [ %p_28.addr.0, %if.end ], [ %conv25, %if.else24 ], [ %p_28.addr.0, %if.then20 ]
+  %.pr = load i32* @g_103, align 4
+  %tobool2933 = icmp eq i32 %.pr, 0
+  br i1 %tobool2933, label %for.cond, label %for.cond28.for.cond.loopexit_crit_edge
+
+for.cond31:                                       ; preds = %for.cond31, %lor.end, %for.cond
+  br label %for.cond31
+}
+
+declare i32 @func_39(i32, i32, i32, i32)