From b32cee560d91bb2ec4325ad348ee322a42a3a3cf Mon Sep 17 00:00:00 2001 From: James Molloy Date: Fri, 16 May 2014 14:24:22 +0000 Subject: Re-enable inline memcpy expansion for Thumb1. Patch by Moritz Roth! git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@208994 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARMSelectionDAGInfo.cpp | 7 +++--- lib/Target/ARM/ARMSubtarget.h | 4 +--- test/CodeGen/ARM/memcpy-inline.ll | 28 ++++++++++++++++++---- test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll | 37 ++++++++++++++++++++++++++++++ 4 files changed, 65 insertions(+), 11 deletions(-) create mode 100644 test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp index cb5812f6d2..7ce988773c 100644 --- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -53,9 +53,10 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, EVT VT = MVT::i32; unsigned VTSize = 4; unsigned i = 0; - const unsigned MAX_LOADS_IN_LDM = 6; - SDValue TFOps[MAX_LOADS_IN_LDM]; - SDValue Loads[MAX_LOADS_IN_LDM]; + // Emit a maximum of 4 loads in Thumb1 since we have fewer registers + const unsigned MAX_LOADS_IN_LDM = Subtarget->isThumb1Only() ? 4 : 6; + SDValue TFOps[6]; + SDValue Loads[6]; uint64_t SrcOff = 0, DstOff = 0; // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 77e4719342..fd106f6fc8 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -239,9 +239,7 @@ protected: /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size /// that still makes it profitable to inline the call. unsigned getMaxInlineSizeThreshold() const { - // FIXME: For now, we don't lower memcpy's to loads / stores for Thumb1. - // Change this once Thumb1 ldmia / stmia support is added. - return isThumb1Only() ? 0 : 64; + return 64; } /// ParseSubtargetFeatures - Parses features string setting specified /// subtarget options. Definition of function is auto generated by tblgen. diff --git a/test/CodeGen/ARM/memcpy-inline.ll b/test/CodeGen/ARM/memcpy-inline.ll index 14d84deea8..84ce4a7f0e 100644 --- a/test/CodeGen/ARM/memcpy-inline.ll +++ b/test/CodeGen/ARM/memcpy-inline.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 -pre-RA-sched=source -disable-post-ra | FileCheck %s - +; RUN: llc < %s -mtriple=thumbv6m-apple-ios -mcpu=cortex-m0 -pre-RA-sched=source -disable-post-ra | FileCheck %s -check-prefix=CHECK-T1 %struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 } @src = external global %struct.x @@ -17,7 +17,12 @@ define i32 @t0() { entry: ; CHECK-LABEL: t0: ; CHECK: vldr [[REG1:d[0-9]+]], -; CHECK: vstr [[REG1]], +; CHECK: vstr [[REG1]], +; CHECK-T1-LABEL: t0: +; CHECK-T1: ldrb [[TREG1:r[0-9]]], +; CHECK-T1: strb [[TREG1]], +; CHECK-T1: ldrh [[TREG2:r[0-9]]], +; CHECK-T1: strh [[TREG2]] call void @llvm.memcpy.p0i8.p0i8.i32(i8* getelementptr inbounds (%struct.x* @dst, i32 0, i32 0), i8* getelementptr inbounds (%struct.x* @src, i32 0, i32 0), i32 11, i32 8, i1 false) ret i32 0 } @@ -83,6 +88,11 @@ entry: ; CHECK: movw [[REG7:r[0-9]+]], #18500 ; CHECK: movt [[REG7:r[0-9]+]], #22866 ; CHECK: str [[REG7]] +; CHECK-T1-LABEL: t5: +; CHECK-T1: movs [[TREG3:r[0-9]]], +; CHECK-T1: strb [[TREG3]], +; CHECK-T1: movs [[TREG4:r[0-9]]], +; CHECK-T1: strb [[TREG4]], tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([7 x i8]* @.str5, i64 0, i64 0), i64 7, i32 1, i1 false) ret void } @@ -90,12 +100,17 @@ entry: define void @t6() nounwind { entry: ; CHECK-LABEL: t6: -; CHECK: vld1.8 {[[REG8:d[0-9]+]]}, [r0] -; CHECK: vstr [[REG8]], [r1] +; CHECK: vld1.8 {[[REG9:d[0-9]+]]}, [r0] +; CHECK: vstr [[REG9]], [r1] ; CHECK: adds r1, #6 ; CHECK: adds r0, #6 ; CHECK: vld1.8 ; CHECK: vst1.16 +; CHECK-T1-LABEL: t6: +; CHECK-T1: movs [[TREG5:r[0-9]]], +; CHECK-T1: strh [[TREG5]], +; CHECK-T1: ldr [[TREG6:r[0-9]]], +; CHECK-T1: str [[TREG6]] call void @llvm.memcpy.p0i8.p0i8.i64(i8* getelementptr inbounds ([512 x i8]* @spool.splbuf, i64 0, i64 0), i8* getelementptr inbounds ([14 x i8]* @.str6, i64 0, i64 0), i64 14, i32 1, i1 false) ret void } @@ -104,9 +119,12 @@ entry: define void @t7(%struct.Foo* nocapture %a, %struct.Foo* nocapture %b) nounwind { entry: -; CHECK: t7 +; CHECK-LABEL: t7: ; CHECK: vld1.32 ; CHECK: vst1.32 +; CHECK-T1-LABEL: t7: +; CHECK-T1: ldr +; CHECK-T1: str %0 = bitcast %struct.Foo* %a to i8* %1 = bitcast %struct.Foo* %b to i8* tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 16, i32 4, i1 false) diff --git a/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll b/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll new file mode 100644 index 0000000000..06cfd9bbef --- /dev/null +++ b/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll @@ -0,0 +1,37 @@ +; RUN: llc -mtriple=thumbv6m-eabi %s -o - | FileCheck %s + +@d = external global [64 x i32] +@s = external global [64 x i32] + +; Function Attrs: nounwind +define void @t1() #0 { +entry: +; CHECK: ldr [[REG0:r[0-9]]], +; CHECK: ldm [[REG0]]!, +; CHECK: ldr [[REG1:r[0-9]]], +; CHECK: stm [[REG1]]!, +; CHECK: subs [[REG0]], #32 +; CHECK-NEXT: ldrb +; CHECK: subs [[REG1]], #32 +; CHECK-NEXT: strb + tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 33, i32 4, i1 false) + ret void +} + +; Function Attrs: nounwind +define void @t2() #0 { +entry: +; CHECK: ldr [[REG0:r[0-9]]], +; CHECK: ldm [[REG0]]!, +; CHECK: ldr [[REG1:r[0-9]]], +; CHECK: stm [[REG1]]!, +; CHECK: ldrh +; CHECK: ldrb +; CHECK: strb +; CHECK: strh + tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* bitcast ([64 x i32]* @s to i8*), i8* bitcast ([64 x i32]* @d to i8*), i32 15, i32 4, i1 false) + ret void +} + +; Function Attrs: nounwind +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1 -- cgit v1.2.3