-rw-r--r--  lib/Target/ARM/ARM.td               | 24
-rw-r--r--  lib/Target/ARM/ARMInstrInfo.td      |  1
-rw-r--r--  lib/Target/ARM/ARMInstrNEON.td      | 20
-rw-r--r--  lib/Target/ARM/ARMSubtarget.cpp     |  1
-rw-r--r--  lib/Target/ARM/ARMSubtarget.h       |  5
-rw-r--r--  test/CodeGen/ARM/zero-cycle-zero.ll | 70
6 files changed, 115 insertions(+), 6 deletions(-)
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index 27bbcc22b6..7916ccc180 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -73,6 +73,11 @@ def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
 def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
                  "Enable support for CRC instructions">;
 
+// Cyclone has preferred instructions for zeroing VFP registers, which can
+// execute in 0 cycles.
+def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
+                       "Has zero-cycle zeroing instructions">;
+
 // Some processors have FP multiply-accumulate instructions that don't
 // play nicely with other VFP / NEON instructions, and it's generally better
 // to just not use them.
@@ -361,6 +366,13 @@ def : ProcessorModel<"cortex-a15", CortexA9Model,
                                     FeatureDSPThumb2, FeatureHasRAS,
                                     FeatureAClass]>;
 
+// FIXME: krait has currently the same Schedule model as A9
+def : ProcessorModel<"krait", CortexA9Model,
+                     [ProcKrait, HasV7Ops,
+                      FeatureNEON, FeatureDB,
+                      FeatureDSPThumb2, FeatureHasRAS,
+                      FeatureAClass]>;
+
 // FIXME: R5 has currently the same ProcessorModel as A8.
 def : ProcessorModel<"cortex-r5", CortexA8Model,
                      [ProcR5, HasV7Ops, FeatureDB,
@@ -395,12 +407,12 @@ def : ProcNoItin<"cortex-a57", [ProcA57, HasV8Ops,
                                  FeatureAClass, FeatureDB, FeatureFPARMv8,
                                  FeatureNEON, FeatureDSPThumb2]>;
 
-// FIXME: krait has currently the same Schedule model as A9
-def : ProcessorModel<"krait", CortexA9Model,
-                     [ProcKrait, HasV7Ops,
-                      FeatureNEON, FeatureDB,
-                      FeatureDSPThumb2, FeatureHasRAS,
-                      FeatureAClass]>;
+// Cyclone is very similar to swift
+def : ProcessorModel<"cyclone", SwiftModel,
+                     [ProcSwift, HasV8Ops, HasV7Ops,
+                      FeatureCrypto, FeatureFPARMv8,
+                      FeatureDB, FeatureDSPThumb2,
+                      FeatureHasRAS, FeatureZCZeroing]>;
 
 //===----------------------------------------------------------------------===//
 // Register File Description
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index cc748e19c8..dfcc11edcd 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -244,6 +244,7 @@ def HasMP : Predicate<"Subtarget->hasMPExtension()">,
 def HasTrustZone : Predicate<"Subtarget->hasTrustZone()">,
                    AssemblerPredicate<"FeatureTrustZone", "TrustZone">;
+def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">;
 def UseNEONForFP : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">;
 def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">;
 def IsThumb : Predicate<"Subtarget->isThumb()">,
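The two hunks above follow LLVM's standard subtarget-feature pattern: the TableGen FeatureZCZeroing definition exposes a "zcz" feature string that sets the HasZeroCycleZeroing flag, and the HasZCZ predicate lets instruction patterns test it. As a rough standalone sketch of that plumbing (plain C++, not actual LLVM code; the class and parsing here are invented for illustration):

    // Standalone sketch (not LLVM code) of the plumbing this patch adds:
    // the "zcz" feature string flips a subtarget boolean, and a codegen
    // predicate like HasZCZ reduces to querying its accessor.
    #include <iostream>
    #include <sstream>
    #include <string>

    class SubtargetSketch {
      bool HasZeroCycleZeroing = false; // mirrors the new ARMSubtarget field

    public:
      // Parse a comma-separated feature string such as "+neon,+zcz".
      void parseFeatures(const std::string &FS) {
        std::stringstream SS(FS);
        std::string Tok;
        while (std::getline(SS, Tok, ','))
          if (Tok == "+zcz")
            HasZeroCycleZeroing = true;
      }

      bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; }
    };

    int main() {
      SubtargetSketch Cyclone;
      Cyclone.parseFeatures("+neon,+zcz");
      // A pattern guarded by Requires<[HasZCZ]> is usable exactly when:
      std::cout << Cyclone.hasZeroCycleZeroing() << "\n"; // prints 1
    }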
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index b18eac55d8..0d46c49bcf 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -5245,6 +5245,26 @@ def VMOVv4f32 : N1ModImm<1, 0b000, 0b1111, 0, 1, 0, 1, (outs QPR:$Vd),
                          [(set QPR:$Vd, (v4f32 (NEONvmovFPImm timm:$SIMM)))]>;
 } // isReMaterializable
+
+// On some CPUs the two instructions "vmov.i32 dD, #0" and "vmov.i32 qD, #0"
+// require zero cycles to execute so they should be used wherever possible for
+// setting a register to zero.
+
+// Even without these pseudo-insts we would probably end up with the correct
+// instruction, but we could not mark the general ones with "isAsCheapAsAMove"
+// since they are sometimes rather expensive (in general).
+
+let AddedComplexity = 50, isAsCheapAsAMove = 1, isReMaterializable = 1 in {
+  def VMOVD0 : ARMPseudoExpand<(outs DPR:$Vd), (ins), 4, IIC_VMOVImm,
+                               [(set DPR:$Vd, (v2i32 NEONimmAllZerosV))],
+                               (VMOVv2i32 DPR:$Vd, 0, (ops 14, zero_reg))>,
+               Requires<[HasZCZ]>;
+  def VMOVQ0 : ARMPseudoExpand<(outs QPR:$Vd), (ins), 4, IIC_VMOVImm,
+                               [(set QPR:$Vd, (v4i32 NEONimmAllZerosV))],
+                               (VMOVv4i32 QPR:$Vd, 0, (ops 14, zero_reg))>,
+               Requires<[HasZCZ]>;
+}
+
 // VMOV : Vector Get Lane (move scalar to ARM core register)
 
 def VGETLNs8 : NVGetLane<{1,1,1,0,0,1,?,1}, 0b1011, {?,?},
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index a290136f6b..0dec1c406a 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -134,6 +134,7 @@ void ARMSubtarget::initializeEnvironment() {
   HasTrustZone = false;
   HasCrypto = false;
   HasCRC = false;
+  HasZeroCycleZeroing = false;
   AllowsUnalignedMem = false;
   Thumb2DSP = false;
   UseNaClTrap = false;
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 2ce99c890f..e76cc85a1a 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -177,6 +177,10 @@ protected:
   /// HasCRC - if true, processor supports CRC instructions
   bool HasCRC;
 
+  /// If true, the instructions "vmov.i32 d0, #0" and "vmov.i32 q0, #0" are
+  /// particularly effective at zeroing a VFP register.
+  bool HasZeroCycleZeroing;
+
   /// AllowsUnalignedMem - If true, the subtarget allows unaligned memory
   /// accesses for some types. For details, see
   /// ARMTargetLowering::allowsUnalignedMemoryAccesses().
@@ -298,6 +302,7 @@ public:
   bool isFPOnlySP() const { return FPOnlySP; }
   bool hasPerfMon() const { return HasPerfMon; }
   bool hasTrustZone() const { return HasTrustZone; }
+  bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; }
   bool prefers32BitThumb() const { return Pref32BitThumb; }
   bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; }
   bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; }
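The comment block in ARMInstrNEON.td is the heart of the patch: dedicated pseudo-instructions exist so that only the known-free zeroing forms carry isAsCheapAsAMove, letting the register allocator re-emit a fresh zero instead of spilling or copying one. A minimal model of that trade-off (standalone C++, not LLVM's actual allocator logic; all names here are invented for the sketch):

    // Illustrative model of what the isReMaterializable /
    // isAsCheapAsAMove flags buy: when a zeroed register is needed
    // again, recompute it rather than preserve it, but only when
    // recomputing is known to be free.
    #include <iostream>

    struct InstrFlags {
      bool IsReMaterializable; // value can be recomputed from scratch
      bool IsAsCheapAsAMove;   // recomputing costs no more than a copy
    };

    // Core of the decision: only recompute when it is both legal and free.
    bool preferRematOverCopy(const InstrFlags &F) {
      return F.IsReMaterializable && F.IsAsCheapAsAMove;
    }

    int main() {
      InstrFlags VMOVD0{true, true};       // the new zeroing pseudo
      InstrFlags GenericVMOV{true, false}; // plain vmov.i32: sometimes expensive
      std::cout << preferRematOverCopy(VMOVD0) << "\n";      // 1: emit a fresh zero
      std::cout << preferRematOverCopy(GenericVMOV) << "\n"; // 0: a copy may be cheaper
    }

This is why the general VMOVv2i32/VMOVv4i32 patterns are left unmarked: on most CPUs a vector immediate move is not free, so blanket-marking them would mislead the allocator everywhere except on cores with the zcz feature.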
diff --git a/test/CodeGen/ARM/zero-cycle-zero.ll b/test/CodeGen/ARM/zero-cycle-zero.ll
new file mode 100644
index 0000000000..121a87f5b8
--- /dev/null
+++ b/test/CodeGen/ARM/zero-cycle-zero.ll
@@ -0,0 +1,70 @@
+; RUN: llc -mtriple=armv8 -mcpu=cyclone < %s | FileCheck %s --check-prefix=CHECK-CYCLONE
+; RUN: llc -mtriple=armv8 -mcpu=swift < %s | FileCheck %s --check-prefix=CHECK-SWIFT
+
+declare arm_aapcs_vfpcc void @take_vec64(<2 x i32>)
+
+define void @test_vec64() {
+; CHECK-CYCLONE-LABEL: test_vec64:
+; CHECK-SWIFT-LABEL: test_vec64:
+
+  call arm_aapcs_vfpcc void @take_vec64(<2 x i32> <i32 0, i32 0>)
+  call arm_aapcs_vfpcc void @take_vec64(<2 x i32> <i32 0, i32 0>)
+; CHECK-CYCLONE-NOT: vmov.f64 d0,
+; CHECK-CYCLONE: vmov.i32 d0, #0
+; CHECK-CYCLONE: bl
+; CHECK-CYCLONE: vmov.i32 d0, #0
+; CHECK-CYCLONE: bl
+
+; CHECK-SWIFT: vmov.f64 [[ZEROREG:d[0-9]+]],
+; CHECK-SWIFT: vmov.i32 [[ZEROREG]], #0
+; CHECK-SWIFT: vorr d0, [[ZEROREG]], [[ZEROREG]]
+; CHECK-SWIFT: bl
+; CHECK-SWIFT: vorr d0, [[ZEROREG]], [[ZEROREG]]
+; CHECK-SWIFT: bl
+
+  ret void
+}
+
+declare arm_aapcs_vfpcc void @take_vec128(<8 x i16>)
+
+define void @test_vec128() {
+; CHECK-CYCLONE-LABEL: test_vec128:
+; CHECK-SWIFT-LABEL: test_vec128:
+
+  call arm_aapcs_vfpcc void @take_vec128(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
+  call arm_aapcs_vfpcc void @take_vec128(<8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>)
+; CHECK-CYCLONE-NOT: vmov.f64 [[ZEROREG:d[0-9]+]],
+; CHECK-CYCLONE: vmov.i32 q0, #0
+; CHECK-CYCLONE: bl
+; CHECK-CYCLONE: vmov.i32 q0, #0
+; CHECK-CYCLONE: bl
+
+; CHECK-SWIFT-NOT: vmov.f64 [[ZEROREG:d[0-9]+]],
+; CHECK-SWIFT: vmov.i32 [[ZEROREG:q[0-9]+]], #0
+; CHECK-SWIFT: vorr q0, [[ZEROREG]], [[ZEROREG]]
+; CHECK-SWIFT: bl
+; CHECK-SWIFT: vorr q0, [[ZEROREG]], [[ZEROREG]]
+; CHECK-SWIFT: bl
+
+  ret void
+}
+
+declare void @take_i32(i32)
+
+define void @test_i32() {
+; CHECK-CYCLONE-LABEL: test_i32:
+; CHECK-SWIFT-LABEL: test_i32:
+
+  call arm_aapcs_vfpcc void @take_i32(i32 0)
+  call arm_aapcs_vfpcc void @take_i32(i32 0)
+; CHECK-CYCLONE-NOT: vmov.f64 [[ZEROREG:d[0-9]+]],
+; CHECK-CYCLONE: mov r0, #0
+; CHECK-CYCLONE: bl
+; CHECK-CYCLONE: mov r0, #0
+; CHECK-CYCLONE: bl
+
+; It doesn't particularly matter what Swift does here; there isn't carefully
+; crafted behaviour that we might break in Cyclone.
+
+  ret void
+}
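The CHECK-CYCLONE lines in the new test expect the expanded form of the pseudos, one fresh "vmov.i32 ... #0" before each call, since re-zeroing is free on Cyclone; Swift instead materializes zero once and copies it with vorr. As a purely illustrative sketch of what the late expansion emits (invented names, not LLVM's actual printer; 14 is the ARM "always" predicate from the .td result):

    // Rough sketch of the expansion ARMPseudoExpand arranges:
    // VMOVD0/VMOVQ0 stand for the real "vmov.i32 <reg>, #0".
    #include <cstdio>

    enum ZeroPseudo { VMOVD0, VMOVQ0 };

    // Render the concrete instruction a zeroing pseudo stands for.
    void printExpansion(ZeroPseudo Op, int RegNum) {
      std::printf("vmov.i32 %c%d, #0\n", Op == VMOVD0 ? 'd' : 'q', RegNum);
    }

    int main() {
      printExpansion(VMOVD0, 0); // matches "CHECK-CYCLONE: vmov.i32 d0, #0"
      printExpansion(VMOVQ0, 0); // matches "CHECK-CYCLONE: vmov.i32 q0, #0"
    }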