summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- lib/Target/R600/SIISelLowering.cpp 27
-rw-r--r-- test/CodeGen/R600/llvm.SI.sample-masked.ll 93
2 files changed, 114 insertions, 6 deletions
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index fe08717a33..b37d5b282b 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -1092,7 +1092,9 @@ static unsigned SubIdx2Lane(unsigned Idx) {
void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
SelectionDAG &DAG) const {
SDNode *Users[4] = { };
- unsigned Writemask = 0, Lane = 0;
+ unsigned Lane = 0;
+ unsigned OldDmask = Node->getConstantOperandVal(0);
+ unsigned NewDmask = 0;
// Try to figure out the used register components
for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
@@ -1103,29 +1105,42 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
return;
+ // Lane means which subreg of %VGPRa_VGPRb_VGPRc_VGPRd is used.
+ // Note that subregs are packed, i.e. Lane==0 is the first bit set
+ // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
+ // set, etc.
Lane = SubIdx2Lane(I->getConstantOperandVal(1));
+ // Set which texture component corresponds to the lane.
+ unsigned Comp;
+ for (unsigned i = 0, Dmask = OldDmask; i <= Lane; i++) {
+ assert(Dmask);
+ Comp = ffs(Dmask)-1;
+ Dmask &= ~(1 << Comp);
+ }
+
// Abort if we have more than one user per component
if (Users[Lane])
return;
Users[Lane] = *I;
- Writemask |= 1 << Lane;
+ NewDmask |= 1 << Comp;
}
- // Abort if all components are used
- if (Writemask == 0xf)
+ // Abort if there's no change
+ if (NewDmask == OldDmask)
return;
// Adjust the writemask in the node
std::vector<SDValue> Ops;
- Ops.push_back(DAG.getTargetConstant(Writemask, MVT::i32));
+ Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32));
for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i)
Ops.push_back(Node->getOperand(i));
Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size());
// If we only got one lane, replace it with a copy
- if (Writemask == (1U << Lane)) {
+ // (if NewDmask has only one bit set...)
+ if (NewDmask && (NewDmask & (NewDmask-1)) == 0) {
SDValue RC = DAG.getTargetConstant(AMDGPU::VReg_32RegClassID, MVT::i32);
SDNode *Copy = DAG.getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
SDLoc(), Users[Lane]->getValueType(0),
diff --git a/test/CodeGen/R600/llvm.SI.sample-masked.ll b/test/CodeGen/R600/llvm.SI.sample-masked.ll
new file mode 100644
index 0000000000..454e48b228
--- /dev/null
+++ b/test/CodeGen/R600/llvm.SI.sample-masked.ll
@@ -0,0 +1,93 @@
+;RUN: llc < %s -march=r600 -mcpu=verde | FileCheck %s
+
+; CHECK-LABEL: @v1
+; CHECK: IMAGE_SAMPLE VGPR{{[0-9]+}}_VGPR{{[0-9]+}}_VGPR{{[0-9]+}}, 13
+define void @v1(i32 %a1) { ; uses elements 0, 2 and 3 of the sample result, so the expected dmask is 0b1101 = 13
+entry:
+ %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+ %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %2 = extractelement <4 x float> %1, i32 0 ; element 0 -> dmask bit 0
+ %3 = extractelement <4 x float> %1, i32 2 ; element 2 -> dmask bit 2
+ %4 = extractelement <4 x float> %1, i32 3 ; element 3 -> dmask bit 3
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) ; %4 is passed twice to fill all four float operands
+ ret void
+}
+
+; CHECK-LABEL: @v2
+; CHECK: IMAGE_SAMPLE VGPR{{[0-9]+}}_VGPR{{[0-9]+}}_VGPR{{[0-9]+}}, 11
+define void @v2(i32 %a1) { ; uses elements 0, 1 and 3 of the sample result, so the expected dmask is 0b1011 = 11
+entry:
+ %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+ %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %2 = extractelement <4 x float> %1, i32 0 ; element 0 -> dmask bit 0
+ %3 = extractelement <4 x float> %1, i32 1 ; element 1 -> dmask bit 1
+ %4 = extractelement <4 x float> %1, i32 3 ; element 3 -> dmask bit 3
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) ; %4 is passed twice to fill all four float operands
+ ret void
+}
+
+; CHECK-LABEL: @v3
+; CHECK: IMAGE_SAMPLE VGPR{{[0-9]+}}_VGPR{{[0-9]+}}_VGPR{{[0-9]+}}, 14
+define void @v3(i32 %a1) { ; uses elements 1, 2 and 3 of the sample result, so the expected dmask is 0b1110 = 14
+entry:
+ %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+ %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %2 = extractelement <4 x float> %1, i32 1 ; element 1 -> dmask bit 1
+ %3 = extractelement <4 x float> %1, i32 2 ; element 2 -> dmask bit 2
+ %4 = extractelement <4 x float> %1, i32 3 ; element 3 -> dmask bit 3
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) ; %4 is passed twice to fill all four float operands
+ ret void
+}
+
+; CHECK-LABEL: @v4
+; CHECK: IMAGE_SAMPLE VGPR{{[0-9]+}}_VGPR{{[0-9]+}}_VGPR{{[0-9]+}}, 7
+define void @v4(i32 %a1) { ; uses elements 0, 1 and 2 of the sample result, so the expected dmask is 0b0111 = 7
+entry:
+ %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+ %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %2 = extractelement <4 x float> %1, i32 0 ; element 0 -> dmask bit 0
+ %3 = extractelement <4 x float> %1, i32 1 ; element 1 -> dmask bit 1
+ %4 = extractelement <4 x float> %1, i32 2 ; element 2 -> dmask bit 2
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4) ; %4 is passed twice to fill all four float operands
+ ret void
+}
+
+; CHECK-LABEL: @v5
+; CHECK: IMAGE_SAMPLE VGPR{{[0-9]+}}_VGPR{{[0-9]+}}, 10
+define void @v5(i32 %a1) { ; uses elements 1 and 3 of the sample result, so the expected dmask is 0b1010 = 10
+entry:
+ %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+ %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %2 = extractelement <4 x float> %1, i32 1 ; element 1 -> dmask bit 1
+ %3 = extractelement <4 x float> %1, i32 3 ; element 3 -> dmask bit 3
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) ; %3 is repeated to fill all four float operands
+ ret void
+}
+
+; CHECK-LABEL: @v6
+; CHECK: IMAGE_SAMPLE VGPR{{[0-9]+}}_VGPR{{[0-9]+}}, 6
+define void @v6(i32 %a1) { ; uses elements 1 and 2 of the sample result, so the expected dmask is 0b0110 = 6
+entry:
+ %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+ %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %2 = extractelement <4 x float> %1, i32 1 ; element 1 -> dmask bit 1
+ %3 = extractelement <4 x float> %1, i32 2 ; element 2 -> dmask bit 2
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) ; %3 is repeated to fill all four float operands
+ ret void
+}
+
+; CHECK-LABEL: @v7
+; CHECK: IMAGE_SAMPLE VGPR{{[0-9]+}}_VGPR{{[0-9]+}}, 9
+define void @v7(i32 %a1) { ; uses elements 0 and 3 of the sample result, so the expected dmask is 0b1001 = 9
+entry:
+ %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
+ %1 = call <4 x float> @llvm.SI.sample.v1i32(<1 x i32> %0, <32 x i8> undef, <16 x i8> undef, i32 0)
+ %2 = extractelement <4 x float> %1, i32 0 ; element 0 -> dmask bit 0
+ %3 = extractelement <4 x float> %1, i32 3 ; element 3 -> dmask bit 3
+ call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3) ; %3 is repeated to fill all four float operands
+ ret void
+}
+
+declare <4 x float> @llvm.SI.sample.v1i32(<1 x i32>, <32 x i8>, <16 x i8>, i32) readnone
+
+declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)