-rw-r--r--  lib/Target/X86/X86InstrInfo.cpp             |   6 +
-rw-r--r--  lib/Target/X86/X86InstrShiftRotate.td       |  55 +++++
-rw-r--r--  test/CodeGen/X86/phys_subreg_coalesce-3.ll  |   2 +-
-rw-r--r--  test/CodeGen/X86/shift-bmi2.ll              | 178 ++++++++++
-rw-r--r--  test/CodeGen/X86/targetLoweringGeneric.ll   |   2 +-
5 files changed, 241 insertions(+), 2 deletions(-)
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 2fb2ed2104..95d2ad4a24 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -565,6 +565,12 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
// BMI/BMI2 foldable instructions
{ X86::RORX32ri, X86::RORX32mi, 0 },
{ X86::RORX64ri, X86::RORX64mi, 0 },
+ { X86::SARX32rr, X86::SARX32rm, 0 },
+ { X86::SARX64rr, X86::SARX64rm, 0 },
+ { X86::SHRX32rr, X86::SHRX32rm, 0 },
+ { X86::SHRX64rr, X86::SHRX64rm, 0 },
+ { X86::SHLX32rr, X86::SHLX32rm, 0 },
+ { X86::SHLX64rr, X86::SHLX64rm, 0 },
};
for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) {
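The OpTbl1 entries above teach the load-folding tables the rr -> rm mapping for the new BMI2 shifts. As a minimal C++ sketch (function name hypothetical), this is the shape of code the fold targets; the expected asm mirrors the shl32p check in the new shift-bmi2.ll test below:

#include <cstdint>

// A variable shift of a value loaded from memory: SHLX32rr is selected
// first, then the peephole fold rewrites it to SHLX32rm, folding the
// load into the shift's source operand.
uint32_t shl_from_mem(const uint32_t *p, uint32_t amt) {
  return *p << amt; // expected with -mcpu=core-avx2: shlxl %esi, (%rdi), %eax
}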
diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td
index fe7d0ecf89..893488c159 100644
--- a/lib/Target/X86/X86InstrShiftRotate.td
+++ b/lib/Target/X86/X86InstrShiftRotate.td
@@ -896,4 +896,59 @@ let Predicates = [HasBMI2] in {
(RORX32mi addr:$src, (ROT32L2R_imm8 imm:$shamt))>;
def : Pat<(rotl (loadi64 addr:$src), (i8 imm:$shamt)),
(RORX64mi addr:$src, (ROT64L2R_imm8 imm:$shamt))>;
+
+ // Prefer SARX/SHRX/SHLX over SAR/SHR/SHL with a variable shift amount, but
+ // not with an immediate shift, i.e. the following code is considered better
+ //
+ // mov %edi, %esi
+ // shl $imm, %esi
+ // ... %edi, ...
+ //
+ // than
+ //
+ // movb $imm, %sil
+ // shlx %sil, %edi, %esi
+ // ... %edi, ...
+ //
+ let AddedComplexity = 1 in {
+ def : Pat<(sra GR32:$src1, GR8:$src2),
+ (SARX32rr GR32:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(sra GR64:$src1, GR8:$src2),
+ (SARX64rr GR64:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(srl GR32:$src1, GR8:$src2),
+ (SHRX32rr GR32:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(srl GR64:$src1, GR8:$src2),
+ (SHRX64rr GR64:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(shl GR32:$src1, GR8:$src2),
+ (SHLX32rr GR32:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(shl GR64:$src1, GR8:$src2),
+ (SHLX64rr GR64:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ }
+
+ // Patterns for SARXrm/SHRXrm/SHLXrm are explicitly omitted to favor
+ //
+ // mov (%ecx), %esi
+ // shl $imm, %esi
+ //
+ // over
+ //
+ // movb $imm, %al
+ // shlx %al, (%ecx), %esi
+ //
+ // As SARXrr/SHRXrr/SHLXrr are favored for variable shifts, the peephole
+ // optimizer will fold them into SARXrm/SHRXrm/SHLXrm where possible.
}
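As a minimal C++ sketch (function names hypothetical), the two cases the comments above contrast look like this; the expected asm follows the shl32 and shl32i checks in the new shift-bmi2.ll test below:

#include <cstdint>

// Variable shift amount: SHLX takes the count in any register, so it does
// not tie the count to %cl and leaves its source operand intact.
uint32_t shl_var(uint32_t x, uint32_t amt) {
  return x << amt; // expected with -mcpu=core-avx2: shlxl %esi, %edi, %eax
}

// Immediate shift amount: plain SHL encodes the count in the instruction,
// so materializing the constant into a register for SHLX would only cost
// an extra move.
uint32_t shl_imm(uint32_t x) {
  return x << 5;   // expected: movl %edi, %eax; shll $5, %eax (no shlx)
}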
diff --git a/test/CodeGen/X86/phys_subreg_coalesce-3.ll b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
index 51320dd6d0..2a20e7ad6f 100644
--- a/test/CodeGen/X86/phys_subreg_coalesce-3.ll
+++ b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=corei7 | FileCheck %s
; rdar://5571034
; This requires physreg joining, %vreg13 is live everywhere:
diff --git a/test/CodeGen/X86/shift-bmi2.ll b/test/CodeGen/X86/shift-bmi2.ll
new file mode 100644
index 0000000000..d1f321f177
--- /dev/null
+++ b/test/CodeGen/X86/shift-bmi2.ll
@@ -0,0 +1,178 @@
+; RUN: llc -mtriple=i386-unknown-unknown -mcpu=core-avx2 < %s | FileCheck --check-prefix=BMI2 %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=core-avx2 < %s | FileCheck --check-prefix=BMI264 %s
+
+define i32 @shl32(i32 %x, i32 %shamt) nounwind uwtable readnone {
+entry:
+ %shl = shl i32 %x, %shamt
+; BMI2: shl32
+; BMI2: shlxl
+; BMI2: ret
+; BMI264: shl32
+; BMI264: shlxl
+; BMI264: ret
+ ret i32 %shl
+}
+
+define i32 @shl32i(i32 %x) nounwind uwtable readnone {
+entry:
+ %shl = shl i32 %x, 5
+; BMI2: shl32i
+; BMI2-NOT: shlxl
+; BMI2: ret
+; BMI264: shl32i
+; BMI264-NOT: shlxl
+; BMI264: ret
+ ret i32 %shl
+}
+
+define i32 @shl32p(i32* %p, i32 %shamt) nounwind uwtable readonly {
+entry:
+ %x = load i32* %p
+ %shl = shl i32 %x, %shamt
+; BMI2: shl32p
+; BMI2: shlxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI2: ret
+; BMI264: shl32p
+; BMI264: shlxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+ ret i32 %shl
+}
+
+define i32 @shl32pi(i32* %p) nounwind uwtable readonly {
+entry:
+ %x = load i32* %p
+ %shl = shl i32 %x, 5
+; BMI2: shl32pi
+; BMI2-NOT: shlxl
+; BMI2: ret
+; BMI264: shl32pi
+; BMI264-NOT: shlxl
+; BMI264: ret
+ ret i32 %shl
+}
+
+define i64 @shl64(i64 %x, i64 %shamt) nounwind uwtable readnone {
+entry:
+ %shl = shl i64 %x, %shamt
+; BMI264: shl64
+; BMI264: shlxq
+; BMI264: ret
+ ret i64 %shl
+}
+
+define i64 @shl64i(i64 %x) nounwind uwtable readnone {
+entry:
+ %shl = shl i64 %x, 7
+; BMI264: shl64i
+; BMI264-NOT: shlxq
+; BMI264: ret
+ ret i64 %shl
+}
+
+define i64 @shl64p(i64* %p, i64 %shamt) nounwind uwtable readonly {
+entry:
+ %x = load i64* %p
+ %shl = shl i64 %x, %shamt
+; BMI264: shl64p
+; BMI264: shlxq %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+ ret i64 %shl
+}
+
+define i64 @shl64pi(i64* %p) nounwind uwtable readonly {
+entry:
+ %x = load i64* %p
+ %shl = shl i64 %x, 7
+; BMI264: shl64pi
+; BMI264-NOT: shlxq
+; BMI264: ret
+ ret i64 %shl
+}
+
+define i32 @lshr32(i32 %x, i32 %shamt) nounwind uwtable readnone {
+entry:
+ %shl = lshr i32 %x, %shamt
+; BMI2: lshr32
+; BMI2: shrxl
+; BMI2: ret
+; BMI264: lshr32
+; BMI264: shrxl
+; BMI264: ret
+ ret i32 %shl
+}
+
+define i32 @lshr32p(i32* %p, i32 %shamt) nounwind uwtable readonly {
+entry:
+ %x = load i32* %p
+ %shl = lshr i32 %x, %shamt
+; BMI2: lshr32p
+; BMI2: shrxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI2: ret
+; BMI264: lshr32p
+; BMI264: shrxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+ ret i32 %shl
+}
+
+define i64 @lshr64(i64 %x, i64 %shamt) nounwind uwtable readnone {
+entry:
+ %shl = lshr i64 %x, %shamt
+; BMI264: lshr64
+; BMI264: shrxq
+; BMI264: ret
+ ret i64 %shl
+}
+
+define i64 @lshr64p(i64* %p, i64 %shamt) nounwind uwtable readonly {
+entry:
+ %x = load i64* %p
+ %shl = lshr i64 %x, %shamt
+; BMI264: lshr64p
+; BMI264: shrxq %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+ ret i64 %shl
+}
+
+define i32 @ashr32(i32 %x, i32 %shamt) nounwind uwtable readnone {
+entry:
+ %shl = ashr i32 %x, %shamt
+; BMI2: ashr32
+; BMI2: sarxl
+; BMI2: ret
+; BMI264: ashr32
+; BMI264: sarxl
+; BMI264: ret
+ ret i32 %shl
+}
+
+define i32 @ashr32p(i32* %p, i32 %shamt) nounwind uwtable readonly {
+entry:
+ %x = load i32* %p
+ %shl = ashr i32 %x, %shamt
+; BMI2: ashr32p
+; BMI2: sarxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI2: ret
+; BMI264: ashr32p
+; BMI264: sarxl %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+ ret i32 %shl
+}
+
+define i64 @ashr64(i64 %x, i64 %shamt) nounwind uwtable readnone {
+entry:
+ %shl = ashr i64 %x, %shamt
+; BMI264: ashr64
+; BMI264: sarxq
+; BMI264: ret
+ ret i64 %shl
+}
+
+define i64 @ashr64p(i64* %p, i64 %shamt) nounwind uwtable readonly {
+entry:
+ %x = load i64* %p
+ %shl = ashr i64 %x, %shamt
+; BMI264: ashr64p
+; BMI264: sarxq %{{.+}}, ({{.+}}), %{{.+}}
+; BMI264: ret
+ ret i64 %shl
+}
diff --git a/test/CodeGen/X86/targetLoweringGeneric.ll b/test/CodeGen/X86/targetLoweringGeneric.ll
index ba5f8f8361..a773e9daef 100644
--- a/test/CodeGen/X86/targetLoweringGeneric.ll
+++ b/test/CodeGen/X86/targetLoweringGeneric.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=i386-apple-darwin9 -fast-isel=false -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=i386-apple-darwin9 -mcpu=corei7 -fast-isel=false -O0 < %s | FileCheck %s
; Gather non-machine specific tests for the transformations in
; CodeGen/SelectionDAG/TargetLowering. Currently, these