diff options
author | Anton Korobeynikov <asl@math.spbu.ru> | 2010-04-07 18:20:42 +0000 |
---|---|---|
committer | Anton Korobeynikov <asl@math.spbu.ru> | 2010-04-07 18:20:42 +0000 |
commit | 0a3e2b591c0f5ae5d31aeb2eb794b3c5c38ffa98 (patch) | |
tree | 884c859c9623067980692e2b2be58a4fd525ad31 /lib/Target | |
parent | fc2b08438c485e3c4c9c0328e38d8abb4687076d (diff) | |
download | llvm-0a3e2b591c0f5ae5d31aeb2eb794b3c5c38ffa98.tar.gz llvm-0a3e2b591c0f5ae5d31aeb2eb794b3c5c38ffa98.tar.bz2 llvm-0a3e2b591c0f5ae5d31aeb2eb794b3c5c38ffa98.tar.xz |
Fix itins for VABA
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@100657 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Target')
-rw-r--r-- | lib/Target/ARM/ARMInstrNEON.td | 55 | ||||
-rw-r--r-- | lib/Target/ARM/ARMSchedule.td | 2 | ||||
-rw-r--r-- | lib/Target/ARM/ARMScheduleV7.td | 42 |
3 files changed, 76 insertions, 23 deletions
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index ff095fc7a1..d4d963b514 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -1711,21 +1711,22 @@ multiclass N3VMulOpSL_HS<bits<4> op11_8, // Neon 3-argument intrinsics, // element sizes of 8, 16 and 32 bits: multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itinD, InstrItinClass itinQ, string OpcodeStr, string Dt, Intrinsic IntOp> { // 64-bit vector types. - def v8i8 : N3VDInt3<op24, op23, 0b00, op11_8, op4, IIC_VMACi16D, + def v8i8 : N3VDInt3<op24, op23, 0b00, op11_8, op4, itinD, OpcodeStr, !strconcat(Dt, "8"), v8i8, v8i8, IntOp>; - def v4i16 : N3VDInt3<op24, op23, 0b01, op11_8, op4, IIC_VMACi16D, + def v4i16 : N3VDInt3<op24, op23, 0b01, op11_8, op4, itinD, OpcodeStr, !strconcat(Dt, "16"), v4i16, v4i16, IntOp>; - def v2i32 : N3VDInt3<op24, op23, 0b10, op11_8, op4, IIC_VMACi32D, + def v2i32 : N3VDInt3<op24, op23, 0b10, op11_8, op4, itinD, OpcodeStr, !strconcat(Dt, "32"), v2i32, v2i32, IntOp>; // 128-bit vector types. - def v16i8 : N3VQInt3<op24, op23, 0b00, op11_8, op4, IIC_VMACi16Q, + def v16i8 : N3VQInt3<op24, op23, 0b00, op11_8, op4, itinQ, OpcodeStr, !strconcat(Dt, "8"), v16i8, v16i8, IntOp>; - def v8i16 : N3VQInt3<op24, op23, 0b01, op11_8, op4, IIC_VMACi16Q, + def v8i16 : N3VQInt3<op24, op23, 0b01, op11_8, op4, itinQ, OpcodeStr, !strconcat(Dt, "16"), v8i16, v8i16, IntOp>; - def v4i32 : N3VQInt3<op24, op23, 0b10, op11_8, op4, IIC_VMACi32Q, + def v4i32 : N3VQInt3<op24, op23, 0b10, op11_8, op4, itinQ, OpcodeStr, !strconcat(Dt, "32"), v4i32, v4i32, IntOp>; } @@ -1734,10 +1735,11 @@ multiclass N3VInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, // First with only element sizes of 16 and 32 bits: multiclass N3VLInt3_HS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, Intrinsic IntOp> { - def v4i32 : N3VLInt3<op24, op23, 0b01, op11_8, op4, IIC_VMACi16D, + def v4i32 : N3VLInt3<op24, op23, 0b01, op11_8, op4, itin, OpcodeStr, !strconcat(Dt, "16"), v4i32, v4i16, IntOp>; - def v2i64 : N3VLInt3<op24, op23, 0b10, op11_8, op4, IIC_VMACi16D, + def v2i64 : N3VLInt3<op24, op23, 0b10, op11_8, op4, itin, OpcodeStr, !strconcat(Dt, "32"), v2i64, v2i32, IntOp>; } @@ -1751,9 +1753,10 @@ multiclass N3VLInt3SL_HS<bit op24, bits<4> op11_8, // ....then also with element size of 8 bits: multiclass N3VLInt3_QHS<bit op24, bit op23, bits<4> op11_8, bit op4, + InstrItinClass itin, string OpcodeStr, string Dt, Intrinsic IntOp> - : N3VLInt3_HS<op24, op23, op11_8, op4, OpcodeStr, Dt, IntOp> { - def v8i16 : N3VLInt3<op24, op23, 0b00, op11_8, op4, IIC_VMACi16D, + : N3VLInt3_HS<op24, op23, op11_8, op4, itin, OpcodeStr, Dt, IntOp> { + def v8i16 : N3VLInt3<op24, op23, 0b00, op11_8, op4, itin, OpcodeStr, !strconcat(Dt, "8"), v8i16, v8i8, IntOp>; } @@ -2177,15 +2180,17 @@ def : Pat<(v4f32 (fadd (v4f32 QPR:$src1), (SubReg_i32_lane imm:$lane)))>; // VMLAL : Vector Multiply Accumulate Long (Q += D * D) -defm VMLALs : N3VLInt3_QHS<0,1,0b1000,0, "vmlal", "s", int_arm_neon_vmlals>; -defm VMLALu : N3VLInt3_QHS<1,1,0b1000,0, "vmlal", "u", int_arm_neon_vmlalu>; +defm VMLALs : N3VLInt3_QHS<0,1,0b1000,0, IIC_VMACi16D, + "vmlal", "s", int_arm_neon_vmlals>; +defm VMLALu : N3VLInt3_QHS<1,1,0b1000,0, IIC_VMACi16D, + "vmlal", "u", int_arm_neon_vmlalu>; defm VMLALsls : N3VLInt3SL_HS<0, 0b0010, "vmlal", "s", int_arm_neon_vmlals>; defm VMLALslu : N3VLInt3SL_HS<1, 0b0010, "vmlal", "u", int_arm_neon_vmlalu>; // VQDMLAL : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D) -defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, "vqdmlal", "s", - int_arm_neon_vqdmlal>; +defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, IIC_VMACi16D, + "vqdmlal", "s", int_arm_neon_vqdmlal>; defm VQDMLALsl: N3VLInt3SL_HS<0, 0b0011, "vqdmlal", "s", int_arm_neon_vqdmlal>; // VMLS : Vector Multiply Subtract (integer and floating-point) @@ -2227,15 +2232,17 @@ def : Pat<(v4f32 (fsub (v4f32 QPR:$src1), (SubReg_i32_lane imm:$lane)))>; // VMLSL : Vector Multiply Subtract Long (Q -= D * D) -defm VMLSLs : N3VLInt3_QHS<0,1,0b1010,0, "vmlsl", "s", int_arm_neon_vmlsls>; -defm VMLSLu : N3VLInt3_QHS<1,1,0b1010,0, "vmlsl", "u", int_arm_neon_vmlslu>; +defm VMLSLs : N3VLInt3_QHS<0,1,0b1010,0, IIC_VMACi16D, + "vmlsl", "s", int_arm_neon_vmlsls>; +defm VMLSLu : N3VLInt3_QHS<1,1,0b1010,0, IIC_VMACi16D, + "vmlsl", "u", int_arm_neon_vmlslu>; defm VMLSLsls : N3VLInt3SL_HS<0, 0b0110, "vmlsl", "s", int_arm_neon_vmlsls>; defm VMLSLslu : N3VLInt3SL_HS<1, 0b0110, "vmlsl", "u", int_arm_neon_vmlslu>; // VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D) -defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, "vqdmlsl", "s", - int_arm_neon_vqdmlsl>; +defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, + "vqdmlsl", "s", int_arm_neon_vqdmlsl>; defm VQDMLSLsl: N3VLInt3SL_HS<0, 0b111, "vqdmlsl", "s", int_arm_neon_vqdmlsl>; // Vector Subtract Operations. @@ -2464,12 +2471,16 @@ defm VABDLu : N3VLInt_QHS<1,1,0b0111,0, IIC_VSUBi4Q, "vabdl", "u", int_arm_neon_vabdlu, 0>; // VABA : Vector Absolute Difference and Accumulate -defm VABAs : N3VInt3_QHS<0,0,0b0111,1, "vaba", "s", int_arm_neon_vabas>; -defm VABAu : N3VInt3_QHS<1,0,0b0111,1, "vaba", "u", int_arm_neon_vabau>; +defm VABAs : N3VInt3_QHS<0,0,0b0111,1, IIC_VABAD, IIC_VABAQ, + "vaba", "s", int_arm_neon_vabas>; +defm VABAu : N3VInt3_QHS<1,0,0b0111,1, IIC_VABAD, IIC_VABAQ, + "vaba", "u", int_arm_neon_vabau>; // VABAL : Vector Absolute Difference and Accumulate Long (Q += | D - D |) -defm VABALs : N3VLInt3_QHS<0,1,0b0101,0, "vabal", "s", int_arm_neon_vabals>; -defm VABALu : N3VLInt3_QHS<1,1,0b0101,0, "vabal", "u", int_arm_neon_vabalu>; +defm VABALs : N3VLInt3_QHS<0,1,0b0101,0, IIC_VABAD, + "vabal", "s", int_arm_neon_vabals>; +defm VABALu : N3VLInt3_QHS<1,1,0b0101,0, IIC_VABAD, + "vabal", "u", int_arm_neon_vabalu>; // Vector Maximum and Minimum. diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td index c35abdf3a3..beb837d5f7 100644 --- a/lib/Target/ARM/ARMSchedule.td +++ b/lib/Target/ARM/ARMSchedule.td @@ -135,6 +135,8 @@ def IIC_VBINi4D : InstrItinClass; def IIC_VBINi4Q : InstrItinClass; def IIC_VSUBi4D : InstrItinClass; def IIC_VSUBi4Q : InstrItinClass; +def IIC_VABAD : InstrItinClass; +def IIC_VABAQ : InstrItinClass; def IIC_VSHLiD : InstrItinClass; def IIC_VSHLiQ : InstrItinClass; def IIC_VSHLi4D : InstrItinClass; diff --git a/lib/Target/ARM/ARMScheduleV7.td b/lib/Target/ARM/ARMScheduleV7.td index ed900f781e..d313754afb 100644 --- a/lib/Target/ARM/ARMScheduleV7.td +++ b/lib/Target/ARM/ARMScheduleV7.td @@ -522,6 +522,15 @@ def CortexA8Itineraries : ProcessorItineraries<[ InstrItinData<IIC_VPALiQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, InstrStage<2, [FU_NPipe]>], [7, 3, 2, 1]>, // + // Double-register Absolute Difference and Accumulate + InstrItinData<IIC_VABAD, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [6, 3, 2, 1]>, + // + // Quad-register Absolute Difference and Accumulate + InstrItinData<IIC_VABAQ, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<2, [FU_NPipe]>], [6, 3, 2, 1]>, + + // // Double-register Integer Multiply (.8, .16) InstrItinData<IIC_VMULi16D, [InstrStage<1, [FU_Pipe0, FU_Pipe1]>, InstrStage<1, [FU_NPipe]>], [6, 2, 2]>, @@ -883,7 +892,38 @@ def CortexA9Itineraries : ProcessorItineraries<[ // Extra 3 latency cycle since wbck is 6 cycles InstrStage2<7, [FU_DRegsVFP], 0, Reserved>, InstrStage<1, [FU_Pipe0, FU_Pipe1]>, - InstrStage<1, [FU_NPipe]>], [4, 2, 1]> + InstrStage<1, [FU_NPipe]>], [4, 2, 1]>, + + // + // Double-register Integer Count + InstrItinData<IIC_VCNTiD, [InstrStage2<1, [FU_DRegsN], 0, Required>, + // Extra 3 latency cycle since wbck is 6 cycles + InstrStage2<7, [FU_DRegsVFP], 0, Reserved>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [3, 2, 2]>, + // + // Quad-register Integer Count + // Result written in N3, but that is relative to the last cycle of multicycle, + // so we use 4 for those cases + InstrItinData<IIC_VCNTiQ, [InstrStage2<1, [FU_DRegsN], 0, Required>, + // Extra 3 latency cycle since wbck is 7 cycles + InstrStage2<8, [FU_DRegsVFP], 0, Reserved>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<2, [FU_NPipe]>], [4, 2, 2]>, + // + // Double-register Absolute Difference and Accumulate + InstrItinData<IIC_VABAD, [InstrStage2<1, [FU_DRegsN], 0, Required>, + // Extra 3 latency cycle since wbck is 6 cycles + InstrStage2<7, [FU_DRegsVFP], 0, Reserved>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<1, [FU_NPipe]>], [6, 3, 2, 1]>, + // + // Quad-register Absolute Difference and Accumulate + InstrItinData<IIC_VABAQ, [InstrStage2<1, [FU_DRegsN], 0, Required>, + // Extra 3 latency cycle since wbck is 6 cycles + InstrStage2<7, [FU_DRegsVFP], 0, Reserved>, + InstrStage<1, [FU_Pipe0, FU_Pipe1]>, + InstrStage<2, [FU_NPipe]>], [6, 3, 2, 1]> ]>; |