author    Jim Grosbach <grosbach@apple.com>  2010-07-17 03:30:54 +0000
committer Jim Grosbach <grosbach@apple.com>  2010-07-17 03:30:54 +0000
commit    5423856e44a7e4b173af211b0fb0675c44945a58 (patch)
tree      bac56b6c0151d1328703beccdcf5e0b34618d6a6
parent    879259faa3b24015949f3a3614ce348a7c20e422 (diff)
Add combiner patterns to more effectively utilize the BFI (bitfield insert)
instruction for non-constant operands. This includes the case referenced in
the README.txt regarding a bitfield copy.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@108608 91177308-0d34-0410-b5e6-96231b3b80d8
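
For context, the bitfield-copy case this commit targets is the one being removed
from README.txt below. Reduced to C (the function name here is illustrative; the
README entry calls it test1, and the commented instructions are quoted from that
entry rather than verified compiler output):

    /* Copy the 16-bit field at bits [7,22] from B into A.
     * 8388480 == 0x007fff80 (bits 7..22 set); -8388481 == ~0x007fff80. */
    int copy_bitfield(int A, int B) {
      return (A & -8388481) | (B & 8388480);
    }

    /* With this combine, ARMv7 can select the or-of-ands as a shift plus a
     * single bitfield insert:
     *     lsr r1, r1, #7        @ move B's field down to bit 0
     *     bfi r0, r1, #7, #16   @ insert 16 bits of r1 into r0 at bit 7
     *     bx  lr
     * instead of materializing the 0x007fff80 constant and using bfc/and/orr. */
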
-rw-r--r--  lib/Target/ARM/ARMISelLowering.cpp  |  84
-rw-r--r--  lib/Target/ARM/README.txt           |  21
-rw-r--r--  test/CodeGen/ARM/bfi.ll             |  23
-rw-r--r--  test/CodeGen/Thumb2/bfi.ll          |  23
4 files changed, 114 insertions(+), 37 deletions(-)
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 2e5ba3567e..2cba4bf0ca 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -4240,21 +4240,33 @@ static SDValue PerformMULCombine(SDNode *N,
static SDValue PerformORCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
+ // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
+ // reasonable.
+
// BFI is only available on V6T2+
if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
return SDValue();
SelectionDAG &DAG = DCI.DAG;
SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
- // or (and A, mask), val => ARMbfi A, val, mask
- // iff (val & mask) == val
- if (N0->getOpcode() != ISD::AND)
+ DebugLoc DL = N->getDebugLoc();
+ // 1) or (and A, mask), val => ARMbfi A, val, mask
+ // iff (val & mask) == val
+ //
+ // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
+ // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
+ // && CountPopulation_32(mask) == CountPopulation_32(~mask2)
+ // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
+ // && CountPopulation_32(mask) == CountPopulation_32(~mask2)
+ // (i.e., copy a bitfield value into another bitfield of the same width)
+ if (N0.getOpcode() != ISD::AND)
return SDValue();
EVT VT = N->getValueType(0);
if (VT != MVT::i32)
return SDValue();
+
// The value and the mask need to be constants so we can verify this is
// actually a bitfield set. If the mask is 0xffff, we can do better
// via a movt instruction, so don't use BFI in that case.
@@ -4264,21 +4276,61 @@ static SDValue PerformORCombine(SDNode *N,
unsigned Mask = C->getZExtValue();
if (Mask == 0xffff)
return SDValue();
- C = dyn_cast<ConstantSDNode>(N1);
- if (!C)
- return SDValue();
- unsigned Val = C->getZExtValue();
- if (!ARM::isBitFieldInvertedMask(Mask) || (Val & ~Mask) != Val)
- return SDValue();
- Val >>= CountTrailingZeros_32(~Mask);
+ SDValue Res;
+ // Case (1): or (and A, mask), val => ARMbfi A, val, mask
+ if ((C = dyn_cast<ConstantSDNode>(N1))) {
+ unsigned Val = C->getZExtValue();
+ if (!ARM::isBitFieldInvertedMask(Mask) || (Val & ~Mask) != Val)
+ return SDValue();
+ Val >>= CountTrailingZeros_32(~Mask);
- DebugLoc DL = N->getDebugLoc();
- SDValue Res = DAG.getNode(ARMISD::BFI, DL, VT, N0.getOperand(0),
- DAG.getConstant(Val, MVT::i32),
- DAG.getConstant(Mask, MVT::i32));
+ Res = DAG.getNode(ARMISD::BFI, DL, VT, N0.getOperand(0),
+ DAG.getConstant(Val, MVT::i32),
+ DAG.getConstant(Mask, MVT::i32));
- // Do not add new nodes to DAG combiner worklist.
- DCI.CombineTo(N, Res, false);
+ // Do not add new nodes to DAG combiner worklist.
+ DCI.CombineTo(N, Res, false);
+ } else if (N1.getOpcode() == ISD::AND) {
+ // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
+ C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
+ if (!C)
+ return SDValue();
+ unsigned Mask2 = C->getZExtValue();
+
+ if (ARM::isBitFieldInvertedMask(Mask) &&
+ ARM::isBitFieldInvertedMask(~Mask2) &&
+ (CountPopulation_32(Mask) == CountPopulation_32(~Mask2))) {
+ // The pack halfword instruction works better for masks that fit it,
+ // so use that when it's available.
+ if (Subtarget->hasT2ExtractPack() &&
+ (Mask == 0xffff || Mask == 0xffff0000))
+ return SDValue();
+ // 2a
+ unsigned lsb = CountTrailingZeros_32(Mask2);
+ Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
+ DAG.getConstant(lsb, MVT::i32));
+ Res = DAG.getNode(ARMISD::BFI, DL, VT, N0.getOperand(0), Res,
+ DAG.getConstant(Mask, MVT::i32));
+ // Do not add new nodes to DAG combiner worklist.
+ DCI.CombineTo(N, Res, false);
+ } else if (ARM::isBitFieldInvertedMask(~Mask) &&
+ ARM::isBitFieldInvertedMask(Mask2) &&
+ (CountPopulation_32(~Mask) == CountPopulation_32(Mask2))) {
+ // The pack halfword instruction works better for masks that fit it,
+ // so use that when it's available.
+ if (Subtarget->hasT2ExtractPack() &&
+ (Mask2 == 0xffff || Mask2 == 0xffff0000))
+ return SDValue();
+ // 2b
+ unsigned lsb = CountTrailingZeros_32(Mask);
+ Res = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0),
+ DAG.getConstant(lsb, MVT::i32));
+ Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
+ DAG.getConstant(Mask2, MVT::i32));
+ // Do not add new nodes to DAG combiner worklist.
+ DCI.CombineTo(N, Res, false);
+ }
+ }
return SDValue();
}
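
The new code paths above hinge on two shape checks: each AND mask must be "all
ones except a single contiguous run of zeros" (the field being written), and in
the copy cases the two fields must have the same width. As a rough standalone
illustration of those guards, here is a minimal C sketch; is_bitfield_inverted_mask
and popcount32 are hypothetical stand-ins approximating ARM::isBitFieldInvertedMask
and CountPopulation_32, not the LLVM implementations:

    #include <stdint.h>

    /* Population count: number of set bits in v. */
    static int popcount32(uint32_t v) {
      int n = 0;
      while (v) { v &= v - 1; ++n; }
      return n;
    }

    /* True if ~mask is one contiguous run of ones, i.e. mask is all ones
     * except a single contiguous field of zeros (and clears at least one bit). */
    static int is_bitfield_inverted_mask(uint32_t mask) {
      uint32_t field = ~mask;                /* the zeroed field */
      if (field == 0) return 0;              /* mask == 0xffffffff: nothing to insert */
      uint32_t low = field & -field;         /* lowest set bit of the field */
      return ((field + low) & field) == 0;   /* adding it must clear the whole run */
    }

    /* Guard for case (2a): or (and A, mask), (and B, mask2) can become
     * ARMbfi A, (lsr B, amt), mask when the two fields line up in width. */
    static int bfi_copy_applies(uint32_t mask, uint32_t mask2) {
      return is_bitfield_inverted_mask(mask) &&
             is_bitfield_inverted_mask(~mask2) &&
             popcount32(mask) == popcount32(~mask2);
    }

For the f2 test below, mask == 0xff80007f and mask2 == 0x007fff80, so both checks
pass with 16-bit fields, and the low bit position of mask2 (7) becomes the shift
amount for the LSR feeding the BFI.
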
diff --git a/lib/Target/ARM/README.txt b/lib/Target/ARM/README.txt
index 0cb8ff0118..ba4e5da5a0 100644
--- a/lib/Target/ARM/README.txt
+++ b/lib/Target/ARM/README.txt
@@ -611,27 +611,6 @@ constant which was already loaded). Not sure what's necessary to do that.
//===---------------------------------------------------------------------===//
-Given the following on ARMv7:
-int test1(int A, int B) {
- return (A&-8388481)|(B&8388480);
-}
-
-We currently generate:
- bfc r0, #7, #16
- movw r2, #:lower16:8388480
- movt r2, #:upper16:8388480
- and r1, r1, r2
- orr r0, r1, r0
- bx lr
-
-The following is much shorter:
- lsr r1, r1, #7
- bfi r0, r1, #7, #16
- bx lr
-
-
-//===---------------------------------------------------------------------===//
-
The code generated for bswap on armv4/5 (CPUs without rev) is less than ideal:
int a(int x) { return __builtin_bswap32(x); }
diff --git a/test/CodeGen/ARM/bfi.ll b/test/CodeGen/ARM/bfi.ll
index 48ef43762f..59e2b43a91 100644
--- a/test/CodeGen/ARM/bfi.ll
+++ b/test/CodeGen/ARM/bfi.ll
@@ -15,3 +15,26 @@ entry:
store i32 %2, i32* bitcast (%struct.F* @X to i32*), align 4
ret void
}
+
+define i32 @f2(i32 %A, i32 %B) nounwind readnone optsize {
+entry:
+; CHECK: f2
+; CHECK: mov r1, r1, lsr #7
+; CHECK: bfi r0, r1, #7, #16
+ %and = and i32 %A, -8388481 ; <i32> [#uses=1]
+ %and2 = and i32 %B, 8388480 ; <i32> [#uses=1]
+ %or = or i32 %and2, %and ; <i32> [#uses=1]
+ ret i32 %or
+}
+
+define i32 @f3(i32 %A, i32 %B) nounwind readnone optsize {
+entry:
+; CHECK: f3
+; CHECK: mov r2, r0, lsr #7
+; CHECK: mov r0, r1
+; CHECK: bfi r0, r2, #7, #16
+ %and = and i32 %A, 8388480 ; <i32> [#uses=1]
+ %and2 = and i32 %B, -8388481 ; <i32> [#uses=1]
+ %or = or i32 %and2, %and ; <i32> [#uses=1]
+ ret i32 %or
+}
diff --git a/test/CodeGen/Thumb2/bfi.ll b/test/CodeGen/Thumb2/bfi.ll
index a256d67800..22473bb35a 100644
--- a/test/CodeGen/Thumb2/bfi.ll
+++ b/test/CodeGen/Thumb2/bfi.ll
@@ -15,3 +15,26 @@ entry:
store i32 %2, i32* bitcast (%struct.F* @X to i32*), align 4
ret void
}
+
+define i32 @f2(i32 %A, i32 %B) nounwind readnone optsize {
+entry:
+; CHECK: f2
+; CHECK: lsrs r1, r1, #7
+; CHECK: bfi r0, r1, #7, #16
+ %and = and i32 %A, -8388481 ; <i32> [#uses=1]
+ %and2 = and i32 %B, 8388480 ; <i32> [#uses=1]
+ %or = or i32 %and2, %and ; <i32> [#uses=1]
+ ret i32 %or
+}
+
+define i32 @f3(i32 %A, i32 %B) nounwind readnone optsize {
+entry:
+; CHECK: f3
+; CHECK: lsrs r2, r0, #7
+; CHECK: mov r0, r1
+; CHECK: bfi r0, r2, #7, #16
+ %and = and i32 %A, 8388480 ; <i32> [#uses=1]
+ %and2 = and i32 %B, -8388481 ; <i32> [#uses=1]
+ %or = or i32 %and2, %and ; <i32> [#uses=1]
+ ret i32 %or
+}