author     Matt Arsenault <Matthew.Arsenault@amd.com>   2014-04-18 05:19:26 +0000
committer  Matt Arsenault <Matthew.Arsenault@amd.com>   2014-04-18 05:19:26 +0000
commit     746734df1accfeb5f336de211655c304d58cb7b3 (patch)
tree       a8dd5416388f375301241808d8c2833c4b4d2e82
parent     532a5ffe4c264030d6e350641947fe69fe7babad (diff)
R600/SI: Try to use scalar BFE.
Use scalar BFE with constant shift and offset when possible. This is complicated by the fact that the scalar version packs the two operands of the vector version into one.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@206558 91177308-0d34-0410-b5e6-96231b3b80d8
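As a standalone illustration of the packed operand layout this commit relies on (bit offset in bits [5:0], width in bits [22:16] of the second source), a minimal sketch of the pack step is shown below; the helper name is hypothetical and not part of the patch, only the bit arithmetic mirrors the selection code.

    #include <cstdint>

    // Pack a BFE bit offset and width into the single immediate operand
    // expected by S_BFE_I32 / S_BFE_U32: offset goes in bits [5:0],
    // width in bits [22:16]. Hypothetical helper for illustration only.
    static uint32_t packBFEOffsetWidth(uint32_t Offset, uint32_t Width) {
      return (Offset & 0x3f) | ((Width & 0x7f) << 16);
    }

For example, packBFEOffsetWidth(0, 1) yields 0x10000 and packBFEOffsetWidth(0, 8) yields 524288, which are the immediates the updated sext-in-reg.ll checks look for.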
 lib/Target/R600/AMDGPUISelDAGToDAG.cpp | 48
 lib/Target/R600/SIInstrInfo.cpp        | 23
 test/CodeGen/R600/sext-in-reg.ll       | 51
 3 files changed, 106 insertions, 16 deletions
diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
index 3c41638b7c..5132d45378 100644
--- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -203,13 +203,14 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
N->setNodeId(-1);
return NULL; // Already selected.
}
+
+ const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
switch (Opc) {
default: break;
// We are selecting i64 ADD here instead of custom lower it during
// DAG legalization, so we can fold some i64 ADDs used for address
// calculation into the LOAD and STORE instructions.
case ISD::ADD: {
- const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
if (N->getValueType(0) != MVT::i64 ||
ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
break;
@@ -255,7 +256,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
}
case ISD::BUILD_VECTOR: {
unsigned RegClassID;
- const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
const AMDGPURegisterInfo *TRI =
static_cast<const AMDGPURegisterInfo*>(TM.getRegisterInfo());
const SIRegisterInfo *SIRI =
@@ -342,7 +342,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
}
case ISD::BUILD_PAIR: {
SDValue RC, SubReg0, SubReg1;
- const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
break;
}
@@ -393,7 +392,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
}
case AMDGPUISD::REGISTER_LOAD: {
- const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
break;
SDValue Addr, Offset;
@@ -410,7 +408,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
Ops);
}
case AMDGPUISD::REGISTER_STORE: {
- const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
break;
SDValue Addr, Offset;
@@ -426,6 +423,47 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
CurDAG->getVTList(MVT::Other),
Ops);
}
+
+ case AMDGPUISD::BFE_I32:
+ case AMDGPUISD::BFE_U32: {
+ if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+ break;
+
+ // There is a scalar version available, but unlike the vector version which
+ // has a separate operand for the offset and width, the scalar version packs
+ // the width and offset into a single operand. Try to move to the scalar
+ // version if the offsets are constant, so that we can try to keep extended
+ // loads of kernel arguments in SGPRs.
+
+ // TODO: Technically we could try to pattern match scalar bitshifts of
+ // dynamic values, but it's probably not useful.
+ ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!Offset)
+ break;
+
+ ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
+ if (!Width)
+ break;
+
+ bool Signed = Opc == AMDGPUISD::BFE_I32;
+
+ // Transformation function, pack the offset and width of a BFE into
+ // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
+ // source, bits [5:0] contain the offset and bits [22:16] the width.
+
+ uint32_t OffsetVal = Offset->getZExtValue();
+ uint32_t WidthVal = Width->getZExtValue();
+
+ uint32_t PackedVal = OffsetVal | WidthVal << 16;
+
+ SDValue PackedOffsetWidth = CurDAG->getTargetConstant(PackedVal, MVT::i32);
+ return CurDAG->getMachineNode(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32,
+ SDLoc(N),
+ MVT::i32,
+ N->getOperand(0),
+ PackedOffsetWidth);
+
+ }
}
return SelectCode(N);
}
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index 21f7a81700..96eeea5627 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -539,6 +539,8 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
+ case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
+ case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
@@ -1009,6 +1011,27 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
addDescImplicitUseDef(NewDesc, Inst);
+ if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
+ const MachineOperand &OffsetWidthOp = Inst->getOperand(2);
+ // If we need to move this to VGPRs, we need to unpack the second operand
+ // back into the 2 separate ones for bit offset and width.
+ assert(OffsetWidthOp.isImm() &&
+ "Scalar BFE is only implemented for constant width and offset");
+ uint32_t Imm = OffsetWidthOp.getImm();
+
+ uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
+ uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
+
+ Inst->RemoveOperand(2); // Remove old immediate.
+ Inst->addOperand(MachineOperand::CreateImm(Offset));
+ Inst->addOperand(MachineOperand::CreateImm(BitWidth));
+
+ Inst->addOperand(MachineOperand::CreateImm(0));
+ Inst->addOperand(MachineOperand::CreateImm(0));
+ Inst->addOperand(MachineOperand::CreateImm(0));
+ Inst->addOperand(MachineOperand::CreateImm(0));
+ }
+
// Update the destination register class.
const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0);
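The moveToVALU change above reverses that packing when a scalar BFE has to be rewritten as the vector form. A minimal sketch of the unpack step, again with a hypothetical helper name and using the same masks as the patch:

    #include <cstdint>

    // Recover the separate bit offset and width operands needed by the
    // VALU form (V_BFE_I32 / V_BFE_U32) from the packed scalar immediate.
    // Hypothetical helper for illustration only.
    static void unpackBFEOffsetWidth(uint32_t Imm,
                                     uint32_t &Offset, uint32_t &Width) {
      Offset = Imm & 0x3f;             // bits [5:0]
      Width  = (Imm & 0x7f0000) >> 16; // bits [22:16]
    }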
diff --git a/test/CodeGen/R600/sext-in-reg.ll b/test/CodeGen/R600/sext-in-reg.ll
index a47da2b25a..b722959aad 100644
--- a/test/CodeGen/R600/sext-in-reg.ll
+++ b/test/CodeGen/R600/sext-in-reg.ll
@@ -1,12 +1,13 @@
-; RUN: llc < %s -march=r600 -mcpu=SI | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc < %s -march=r600 -mcpu=cypress | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
declare i32 @llvm.AMDGPU.imax(i32, i32) nounwind readnone
; FUNC-LABEL: @sext_in_reg_i1_i32
; SI: S_LOAD_DWORD [[ARG:s[0-9]+]],
-; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], [[ARG]], 0, 1
+; SI: S_BFE_I32 [[SEXTRACT:s[0-9]+]], [[ARG]], 0x10000
+; SI: V_MOV_B32_e32 [[EXTRACT:v[0-9]+]], [[SEXTRACT]]
; SI: BUFFER_STORE_DWORD [[EXTRACT]],
; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
@@ -148,8 +149,8 @@ define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) noun
; This is broken on Evergreen for some reason related to the <1 x i64> kernel arguments.
; XFUNC-LABEL: @sext_in_reg_i8_to_v1i64
-; XSI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 8
-; XSI: V_ASHRREV_I32_e32 {{v[0-9]+}}, 31,
+; XSI: S_BFE_I32 [[EXTRACT:s[0-9]+]], {{s[0-9]+}}, 524288
+; XSI: S_ASHR_I32 {{v[0-9]+}}, [[EXTRACT]], 31
; XSI: BUFFER_STORE_DWORD
; XEG: BFE_INT
; XEG: ASHR
@@ -204,8 +205,8 @@ define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out
; FUNC-LABEL: @sext_in_reg_v2i1_to_v2i32
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
; SI: BUFFER_STORE_DWORDX2
; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
@@ -221,10 +222,10 @@ define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
}
; FUNC-LABEL: @sext_in_reg_v4i1_to_v4i32
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
-; SI: V_BFE_I32 {{v[0-9]+}}, {{s[0-9]+}}, 0, 1
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
+; SI: S_BFE_I32 {{s[0-9]+}}, {{s[0-9]+}}, 0x10000
; SI: BUFFER_STORE_DWORDX4
; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+]]{{\.[XYZW][XYZW][XYZW][XYZW]}}, [[ADDR:T[0-9]+.[XYZW]]]
@@ -320,6 +321,34 @@ define void @testcase_3(i8 addrspace(1)* %out, i8 %a) nounwind {
ret void
}
+; FUNC-LABEL: @vgpr_sext_in_reg_v4i8_to_v4i32
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
+define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
+ %loada = load <4 x i32> addrspace(1)* %a, align 16
+ %loadb = load <4 x i32> addrspace(1)* %b, align 16
+ %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
+ %shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
+ %ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>
+ store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
+; FUNC-LABEL: @vgpr_sext_in_reg_v4i16_to_v4i32
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
+; SI: V_BFE_I32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
+define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) nounwind {
+ %loada = load <4 x i32> addrspace(1)* %a, align 16
+ %loadb = load <4 x i32> addrspace(1)* %b, align 16
+ %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
+ %shl = shl <4 x i32> %c, <i32 16, i32 16, i32 16, i32 16>
+ %ashr = ashr <4 x i32> %shl, <i32 16, i32 16, i32 16, i32 16>
+ store <4 x i32> %ashr, <4 x i32> addrspace(1)* %out, align 8
+ ret void
+}
+
; FIXME: The BFE should really be eliminated. I think it should happen
; when computeMaskedBitsForTargetNode is implemented for imax.