diff options
author | Weiming Zhao <weimingz@codeaurora.org> | 2014-01-15 01:32:12 +0000 |
---|---|---|
committer | Weiming Zhao <weimingz@codeaurora.org> | 2014-01-15 01:32:12 +0000 |
commit | 2a0c41756bfb697f4c028b84501b31b1320786d1 (patch) | |
tree | 44dfa356755cae4ea60667f995339e5f08e891f3 | |
parent | 0be7e6ffb1683c700d620376f318382e7a237cb2 (diff) | |
download | llvm-2a0c41756bfb697f4c028b84501b31b1320786d1.tar.gz llvm-2a0c41756bfb697f4c028b84501b31b1320786d1.tar.bz2 llvm-2a0c41756bfb697f4c028b84501b31b1320786d1.tar.xz |
PR 18466: Fix ARM Pseudo Expansion
When expanding NEON pseudo stores, the expansion may miss the implicit uses of
sub-registers, which can cause the post-RA scheduler to reorder instructions
in a way that breaks anti-dependencies.
For example:
VST1d64QPseudo %R0<kill>, 16, %Q9_Q10, pred:14, pred:%noreg
will be expanded to
VST1d64Q %R0<kill>, 16, %D18, pred:14, pred:%noreg;
An instruction that defines %D20 may be scheduled before the store by
mistake.
This patch adds implicit uses for such cases. For the example above, it
emits:
VST1d64Q %R0<kill>, 8, %D18, pred:14, pred:%noreg, %Q9_Q10<imp-use>
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@199282 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | lib/Target/ARM/ARMExpandPseudoInsts.cpp | 6 | ||||
-rw-r--r-- | test/CodeGen/ARM/2014-01-09-pseudo_expand_implicit_reg.ll | 55 |
2 files changed, 59 insertions, 2 deletions
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index f695a8e4e5..b9594e6128 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -479,6 +479,8 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) { if (SrcIsKill && !SrcIsUndef) // Add an implicit kill for the super-reg. MIB->addRegisterKilled(SrcReg, TRI, true); + else if (!SrcIsUndef) + MIB.addReg(SrcReg, RegState::Implicit); // Add implicit uses for src reg. TransferImpOps(MI, MIB, MIB); // Transfer memoperands. @@ -604,8 +606,8 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI, MIB.addOperand(MI.getOperand(OpIdx++)); MIB.addOperand(MI.getOperand(OpIdx++)); - if (SrcIsKill) // Add an implicit kill for the super-reg. - MIB->addRegisterKilled(SrcReg, TRI, true); + // Add an implicit kill and use for the super-reg. + MIB.addReg(SrcReg, RegState::Implicit | getKillRegState(SrcIsKill)); TransferImpOps(MI, MIB, MIB); MI.eraseFromParent(); } diff --git a/test/CodeGen/ARM/2014-01-09-pseudo_expand_implicit_reg.ll b/test/CodeGen/ARM/2014-01-09-pseudo_expand_implicit_reg.ll new file mode 100644 index 0000000000..60f361e7cb --- /dev/null +++ b/test/CodeGen/ARM/2014-01-09-pseudo_expand_implicit_reg.ll @@ -0,0 +1,55 @@ +; RUN: llc < %s -march=arm -mattr=+neon -print-before=post-RA-sched > %t 2>&1 && FileCheck < %t %s + +define void @vst(i8* %m, [4 x i64] %v) { +entry: +; CHECK: vst: +; CHECK: VST1d64Q %R{{[0-9]+}}<kill>, 8, %D{{[0-9]+}}, pred:14, pred:%noreg, %Q{{[0-9]+}}_Q{{[0-9]+}}<imp-use> + + %v0 = extractvalue [4 x i64] %v, 0 + %v1 = extractvalue [4 x i64] %v, 1 + %v2 = extractvalue [4 x i64] %v, 2 + %v3 = extractvalue [4 x i64] %v, 3 + + %t0 = bitcast i64 %v0 to <8 x i8> + %t1 = bitcast i64 %v1 to <8 x i8> + %t2 = bitcast i64 %v2 to <8 x i8> + %t3 = bitcast i64 %v3 to <8 x i8> + + %s0 = bitcast <8 x i8> %t0 to <1 x i64> + %s1 = bitcast <8 x i8> %t1 to <1 x i64> + %s2 = bitcast <8 x i8> %t2 
to <1 x i64> + %s3 = bitcast <8 x i8> %t3 to <1 x i64> + + %tmp0 = bitcast <1 x i64> %s2 to i64 + %tmp1 = bitcast <1 x i64> %s3 to i64 + + %n0 = insertelement <2 x i64> undef, i64 %tmp0, i32 0 + %n1 = insertelement <2 x i64> %n0, i64 %tmp1, i32 1 + + call void @llvm.arm.neon.vst4.v1i64(i8* %m, <1 x i64> %s0, <1 x i64> %s1, <1 x i64> %s2, <1 x i64> %s3, i32 8) + + call void @bar(<2 x i64> %n1) + + ret void +} + +%struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } +define <8 x i8> @vtbx4(<8 x i8>* %A, %struct.__neon_int8x8x4_t* %B, <8 x i8>* %C) nounwind { +; CHECK: vtbx4: +; CHECK: VTBX4 {{.*}}, pred:14, pred:%noreg, %Q{{[0-9]+}}_Q{{[0-9]+}}<imp-use> + %tmp1 = load <8 x i8>* %A + %tmp2 = load %struct.__neon_int8x8x4_t* %B + %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 0 + %tmp4 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 1 + %tmp5 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 2 + %tmp6 = extractvalue %struct.__neon_int8x8x4_t %tmp2, 3 + %tmp7 = load <8 x i8>* %C + %tmp8 = call <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8> %tmp1, <8 x i8> %tmp3, <8 x i8> %tmp4, <8 x i8> %tmp5, <8 x i8> %tmp6, <8 x i8> %tmp7) + call void @bar2(%struct.__neon_int8x8x4_t %tmp2, <8 x i8> %tmp8) + ret <8 x i8> %tmp8 +} + +declare void @llvm.arm.neon.vst4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32) +declare <8 x i8> @llvm.arm.neon.vtbx4(<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone +declare void @bar2(%struct.__neon_int8x8x4_t, <8 x i8>) +declare void @bar(<2 x i64> %arg) |