diff options
author | Michael Liao <michael.liao@intel.com> | 2013-03-25 23:50:10 +0000 |
---|---|---|
committer | Michael Liao <michael.liao@intel.com> | 2013-03-25 23:50:10 +0000 |
commit | d4584c9e5658887ec50c43760c988d04eaa13e34 (patch) | |
tree | 1f64d66547a6aed3e1b1ee10531534229ad01f18 | |
parent | b4f98ea1213c866f39aa5b341ec0116f9c2335d7 (diff) | |
download | llvm-d4584c9e5658887ec50c43760c988d04eaa13e34.tar.gz llvm-d4584c9e5658887ec50c43760c988d04eaa13e34.tar.bz2 llvm-d4584c9e5658887ec50c43760c988d04eaa13e34.tar.xz |
Revise alignment checking/calculation on 256-bit unaligned memory access
- It's still considered aligned when the specified alignment is larger
than the natural alignment;
- The new alignment for the high 128-bit vector should be min(16,
alignment), since the pointer is advanced by 16 bytes — a power-of-2 offset.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@177947 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 13 | ||||
-rw-r--r-- | test/CodeGen/X86/avx-load-store.ll | 24 |
2 files changed, 29 insertions, 8 deletions
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 23cfd6d72f..fef2b9659b 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -16639,11 +16639,10 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); unsigned RegSz = RegVT.getSizeInBits(); + // On Sandybridge unaligned 256bit loads are inefficient. ISD::LoadExtType Ext = Ld->getExtensionType(); unsigned Alignment = Ld->getAlignment(); - bool IsAligned = Alignment == 0 || Alignment == MemVT.getSizeInBits()/8; - - // On Sandybridge unaligned 256bit loads are inefficient. + bool IsAligned = Alignment == 0 || Alignment >= MemVT.getSizeInBits()/8; if (RegVT.is256BitVector() && !Subtarget->hasInt256() && !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) { unsigned NumElems = RegVT.getVectorNumElements(); @@ -16663,7 +16662,7 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG, SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(), Ld->isVolatile(), Ld->isNonTemporal(), Ld->isInvariant(), - std::max(Alignment/2U, 1U)); + std::min(16U, Alignment)); SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Load1.getValue(1), Load2.getValue(1)); @@ -16834,13 +16833,13 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, DebugLoc dl = St->getDebugLoc(); SDValue StoredVal = St->getOperand(1); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - unsigned Alignment = St->getAlignment(); - bool IsAligned = Alignment == 0 || Alignment == VT.getSizeInBits()/8; // If we are saving a concatenation of two XMM registers, perform two stores. // On Sandy Bridge, 256-bit memory operations are executed by two // 128-bit ports. However, on Haswell it is better to issue a single 256-bit // memory operation. 
+ unsigned Alignment = St->getAlignment(); + bool IsAligned = Alignment == 0 || Alignment >= VT.getSizeInBits()/8; if (VT.is256BitVector() && !Subtarget->hasInt256() && StVT == VT && !IsAligned) { unsigned NumElems = VT.getVectorNumElements(); @@ -16860,7 +16859,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG, SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(), St->isVolatile(), St->isNonTemporal(), - std::max(Alignment/2U, 1U)); + std::min(16U, Alignment)); return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); } diff --git a/test/CodeGen/X86/avx-load-store.ll b/test/CodeGen/X86/avx-load-store.ll index 0afaff830d..a6775aba09 100644 --- a/test/CodeGen/X86/avx-load-store.ll +++ b/test/CodeGen/X86/avx-load-store.ll @@ -81,7 +81,7 @@ define void @storev32i8_01(<32 x i8> %a) nounwind { ; CHECK: _double_save ; CHECK-NOT: vinsertf128 $1 ; CHECK-NOT: vinsertf128 $0 -; CHECK: vmovups %xmm +; CHECK: vmovaps %xmm ; CHECK: vmovaps %xmm define void @double_save(<4 x i32> %A, <4 x i32> %B, <8 x i32>* %P) nounwind ssp { entry: @@ -127,3 +127,25 @@ define void @add8i32(<8 x i32>* %ret, <8 x i32>* %bp) nounwind { store <8 x i32> %x, <8 x i32>* %ret, align 1 ret void } + +; CHECK: add4i64a64 +; CHECK: vmovaps ({{.*}}), %ymm{{.*}} +; CHECK: vmovaps %ymm{{.*}}, ({{.*}}) +define void @add4i64a64(<4 x i64>* %ret, <4 x i64>* %bp) nounwind { + %b = load <4 x i64>* %bp, align 64 + %x = add <4 x i64> zeroinitializer, %b + store <4 x i64> %x, <4 x i64>* %ret, align 64 + ret void +} + +; CHECK: add4i64a16 +; CHECK: vmovaps {{.*}}({{.*}}), %xmm{{.*}} +; CHECK: vmovaps {{.*}}({{.*}}), %xmm{{.*}} +; CHECK: vmovaps %xmm{{.*}}, {{.*}}({{.*}}) +; CHECK: vmovaps %xmm{{.*}}, {{.*}}({{.*}}) +define void @add4i64a16(<4 x i64>* %ret, <4 x i64>* %bp) nounwind { + %b = load <4 x i64>* %bp, align 16 + %x = add <4 x i64> zeroinitializer, %b + store <4 x i64> %x, <4 x i64>* %ret, align 16 + ret void +} |