Diffstat (limited to 'test/CodeGen')
23 files changed, 6098 insertions, 1 deletion
diff --git a/test/CodeGen/AArch64/complex-copy-noneon.ll b/test/CodeGen/AArch64/complex-copy-noneon.ll new file mode 100644 index 0000000000..4ae547856e --- /dev/null +++ b/test/CodeGen/AArch64/complex-copy-noneon.ll @@ -0,0 +1,21 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-neon < %s + +; The DAG combiner decided to use a vector load/store for this struct copy +; previously. This probably shouldn't happen without NEON, but the most +; important thing is that it compiles. + +define void @store_combine() nounwind { + %src = alloca { double, double }, align 8 + %dst = alloca { double, double }, align 8 + + %src.realp = getelementptr inbounds { double, double }* %src, i32 0, i32 0 + %src.real = load double* %src.realp + %src.imagp = getelementptr inbounds { double, double }* %src, i32 0, i32 1 + %src.imag = load double* %src.imagp + + %dst.realp = getelementptr inbounds { double, double }* %dst, i32 0, i32 0 + %dst.imagp = getelementptr inbounds { double, double }* %dst, i32 0, i32 1 + store double %src.real, double* %dst.realp + store double %src.imag, double* %dst.imagp + ret void +} diff --git a/test/CodeGen/AArch64/inline-asm-constraints.ll b/test/CodeGen/AArch64/inline-asm-constraints.ll index cfa06a4e0b..18a3b37b41 100644 --- a/test/CodeGen/AArch64/inline-asm-constraints.ll +++ b/test/CodeGen/AArch64/inline-asm-constraints.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s +;RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s define i64 @test_inline_constraint_r(i64 %base, i32 %offset) { ; CHECK-LABEL: test_inline_constraint_r: @@ -44,6 +44,26 @@ define i32 @test_inline_constraint_Q(i32 *%ptr) { @dump = global fp128 zeroinitializer +define void @test_inline_constraint_w(<8 x i8> %vec64, <4 x float> %vec128, half %hlf, float %flt, double %dbl, fp128 %quad) { +; CHECK: test_inline_constraint_w: + call <8 x i8> asm sideeffect "add $0.8b, $1.8b, $1.8b", "=w,w"(<8 x i8> %vec64) + call <8 x i8> asm sideeffect "fadd $0.4s, $1.4s, $1.4s", "=w,w"(<4 x float> %vec128) +; CHECK: add {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +; CHECK: fadd {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + + ; Arguably semantically dodgy to output "vN", but it's what GCC does + ; so purely for compatibility we want vector registers to be output. 
+ call float asm sideeffect "fcvt ${0:s}, ${1:h}", "=w,w"(half undef) + call float asm sideeffect "fadd $0.2s, $0.2s, $0.2s", "=w,w"(float %flt) + call double asm sideeffect "fadd $0.2d, $0.2d, $0.2d", "=w,w"(double %dbl) + call fp128 asm sideeffect "fadd $0.2d, $0.2d, $0.2d", "=w,w"(fp128 %quad) +; CHECK: fcvt {{s[0-9]+}}, {{h[0-9]+}} +; CHECK: fadd {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +; CHECK: fadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +; CHECK: fadd {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + ret void +} + define void @test_inline_constraint_I() { ; CHECK-LABEL: test_inline_constraint_I: call void asm sideeffect "add x0, x0, $0", "I"(i32 0) diff --git a/test/CodeGen/AArch64/neon-aba-abd.ll b/test/CodeGen/AArch64/neon-aba-abd.ll new file mode 100644 index 0000000000..b423666d80 --- /dev/null +++ b/test/CodeGen/AArch64/neon-aba-abd.ll @@ -0,0 +1,226 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_uabd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_uabd_v8i8: + %abd = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: uabd v0.8b, v0.8b, v1.8b + ret <8 x i8> %abd +} + +define <8 x i8> @test_uaba_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_uaba_v8i8: + %abd = call <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) + %aba = add <8 x i8> %lhs, %abd +; CHECK: uaba v0.8b, v0.8b, v1.8b + ret <8 x i8> %aba +} + +define <8 x i8> @test_sabd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_sabd_v8i8: + %abd = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: sabd v0.8b, v0.8b, v1.8b + ret <8 x i8> %abd +} + +define <8 x i8> @test_saba_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_saba_v8i8: + %abd = call <8 x i8> @llvm.arm.neon.vabds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) + %aba = add <8 x i8> %lhs, %abd +; CHECK: saba v0.8b, v0.8b, v1.8b + ret <8 x i8> %aba +} + +declare <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_uabd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_uabd_v16i8: + %abd = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: uabd v0.16b, v0.16b, v1.16b + ret <16 x i8> %abd +} + +define <16 x i8> @test_uaba_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_uaba_v16i8: + %abd = call <16 x i8> @llvm.arm.neon.vabdu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) + %aba = add <16 x i8> %lhs, %abd +; CHECK: uaba v0.16b, v0.16b, v1.16b + ret <16 x i8> %aba +} + +define <16 x i8> @test_sabd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_sabd_v16i8: + %abd = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: sabd v0.16b, v0.16b, v1.16b + ret <16 x i8> %abd +} + +define <16 x i8> @test_saba_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_saba_v16i8: + %abd = call <16 x i8> @llvm.arm.neon.vabds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) + %aba = add <16 x i8> %lhs, %abd +; CHECK: saba v0.16b, v0.16b, v1.16b + ret <16 x i8> %aba +} + +declare <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_uabd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_uabd_v4i16: + %abd = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x 
i16> %lhs, <4 x i16> %rhs) +; CHECK: uabd v0.4h, v0.4h, v1.4h + ret <4 x i16> %abd +} + +define <4 x i16> @test_uaba_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_uaba_v4i16: + %abd = call <4 x i16> @llvm.arm.neon.vabdu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) + %aba = add <4 x i16> %lhs, %abd +; CHECK: uaba v0.4h, v0.4h, v1.4h + ret <4 x i16> %aba +} + +define <4 x i16> @test_sabd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_sabd_v4i16: + %abd = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: sabd v0.4h, v0.4h, v1.4h + ret <4 x i16> %abd +} + +define <4 x i16> @test_saba_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_saba_v4i16: + %abd = call <4 x i16> @llvm.arm.neon.vabds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) + %aba = add <4 x i16> %lhs, %abd +; CHECK: saba v0.4h, v0.4h, v1.4h + ret <4 x i16> %aba +} + +declare <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_uabd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_uabd_v8i16: + %abd = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: uabd v0.8h, v0.8h, v1.8h + ret <8 x i16> %abd +} + +define <8 x i16> @test_uaba_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_uaba_v8i16: + %abd = call <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) + %aba = add <8 x i16> %lhs, %abd +; CHECK: uaba v0.8h, v0.8h, v1.8h + ret <8 x i16> %aba +} + +define <8 x i16> @test_sabd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_sabd_v8i16: + %abd = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: sabd v0.8h, v0.8h, v1.8h + ret <8 x i16> %abd +} + +define <8 x i16> @test_saba_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_saba_v8i16: + %abd = call <8 x i16> @llvm.arm.neon.vabds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) + %aba = add <8 x i16> %lhs, %abd +; CHECK: saba v0.8h, v0.8h, v1.8h + ret <8 x i16> %aba +} + +declare <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_uabd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_uabd_v2i32: + %abd = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: uabd v0.2s, v0.2s, v1.2s + ret <2 x i32> %abd +} + +define <2 x i32> @test_uaba_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_uaba_v2i32: + %abd = call <2 x i32> @llvm.arm.neon.vabdu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) + %aba = add <2 x i32> %lhs, %abd +; CHECK: uaba v0.2s, v0.2s, v1.2s + ret <2 x i32> %aba +} + +define <2 x i32> @test_sabd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_sabd_v2i32: + %abd = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: sabd v0.2s, v0.2s, v1.2s + ret <2 x i32> %abd +} + +define <2 x i32> @test_saba_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_saba_v2i32: + %abd = call <2 x i32> @llvm.arm.neon.vabds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) + %aba = add <2 x i32> %lhs, %abd +; CHECK: saba v0.2s, v0.2s, v1.2s + ret <2 x i32> %aba +} + +declare <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_uabd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_uabd_v4i32: + %abd = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: uabd v0.4s, v0.4s, 
v1.4s + ret <4 x i32> %abd +} + +define <4 x i32> @test_uaba_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_uaba_v4i32: + %abd = call <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) + %aba = add <4 x i32> %lhs, %abd +; CHECK: uaba v0.4s, v0.4s, v1.4s + ret <4 x i32> %aba +} + +define <4 x i32> @test_sabd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_sabd_v4i32: + %abd = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: sabd v0.4s, v0.4s, v1.4s + ret <4 x i32> %abd +} + +define <4 x i32> @test_saba_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_saba_v4i32: + %abd = call <4 x i32> @llvm.arm.neon.vabds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) + %aba = add <4 x i32> %lhs, %abd +; CHECK: saba v0.4s, v0.4s, v1.4s + ret <4 x i32> %aba +} + +declare <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float>, <2 x float>) + +define <2 x float> @test_fabd_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_fabd_v2f32: + %abd = call <2 x float> @llvm.arm.neon.vabds.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: fabd v0.2s, v0.2s, v1.2s + ret <2 x float> %abd +} + +declare <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float>, <4 x float>) + +define <4 x float> @test_fabd_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_fabd_v4f32: + %abd = call <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: fabd v0.4s, v0.4s, v1.4s + ret <4 x float> %abd +} + +declare <2 x double> @llvm.arm.neon.vabds.v2f64(<2 x double>, <2 x double>) + +define <2 x double> @test_fabd_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_fabd_v2f64: + %abd = call <2 x double> @llvm.arm.neon.vabds.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: fabd v0.2d, v0.2d, v1.2d + ret <2 x double> %abd +}
\ No newline at end of file diff --git a/test/CodeGen/AArch64/neon-add-pairwise.ll b/test/CodeGen/AArch64/neon-add-pairwise.ll new file mode 100644 index 0000000000..1abfed3190 --- /dev/null +++ b/test/CodeGen/AArch64/neon-add-pairwise.ll @@ -0,0 +1,92 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_addp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; Using registers other than v0, v1 are possible, but would be odd. +; CHECK: test_addp_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: addp v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vpadd.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_addp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_addp_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vpadd.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: addp v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_addp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_addp_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: addp v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vpadd.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_addp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_addp_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vpadd.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: addp v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_addp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_addp_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: addp v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vpadd.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_addp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_addp_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vpadd.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: addp v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + + +declare <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64>, <2 x i64>) + +define <2 x i64> @test_addp_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_addp_v2i64: + %val = call <2 x i64> @llvm.arm.neon.vpadd.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: addp v0.2d, v0.2d, v1.2d + ret <2 x i64> %val +} + +declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.arm.neon.vpadd.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.arm.neon.vpadd.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @test_faddp_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_faddp_v2f32: + %val = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: faddp v0.2s, v0.2s, v1.2s + ret <2 x float> %val +} + +define <4 x float> @test_faddp_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_faddp_v4f32: + %val = call <4 x float> @llvm.arm.neon.vpadd.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: faddp v0.4s, v0.4s, v1.4s + ret <4 x float> %val +} + +define <2 x double> @test_faddp_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_faddp_v2f64: + %val = call <2 x double> @llvm.arm.neon.vpadd.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: faddp v0.2d, v0.2d, v1.2d + 
ret <2 x double> %val +} + diff --git a/test/CodeGen/AArch64/neon-add-sub.ll b/test/CodeGen/AArch64/neon-add-sub.ll new file mode 100644 index 0000000000..65ec8a247e --- /dev/null +++ b/test/CodeGen/AArch64/neon-add-sub.ll @@ -0,0 +1,132 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +define <8 x i8> @add8xi8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: add {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp3 = add <8 x i8> %A, %B; + ret <8 x i8> %tmp3 +} + +define <16 x i8> @add16xi8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: add {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp3 = add <16 x i8> %A, %B; + ret <16 x i8> %tmp3 +} + +define <4 x i16> @add4xi16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: add {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h + %tmp3 = add <4 x i16> %A, %B; + ret <4 x i16> %tmp3 +} + +define <8 x i16> @add8xi16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: add {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h + %tmp3 = add <8 x i16> %A, %B; + ret <8 x i16> %tmp3 +} + +define <2 x i32> @add2xi32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: add {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %tmp3 = add <2 x i32> %A, %B; + ret <2 x i32> %tmp3 +} + +define <4 x i32> @add4x32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: add {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %tmp3 = add <4 x i32> %A, %B; + ret <4 x i32> %tmp3 +} + +define <2 x i64> @add2xi64(<2 x i64> %A, <2 x i64> %B) { +;CHECK: add {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d + %tmp3 = add <2 x i64> %A, %B; + ret <2 x i64> %tmp3 +} + +define <2 x float> @add2xfloat(<2 x float> %A, <2 x float> %B) { +;CHECK: fadd {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %tmp3 = fadd <2 x float> %A, %B; + ret <2 x float> %tmp3 +} + +define <4 x float> @add4xfloat(<4 x float> %A, <4 x float> %B) { +;CHECK: fadd {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %tmp3 = fadd <4 x float> %A, %B; + ret <4 x float> %tmp3 +} +define <2 x double> @add2xdouble(<2 x double> %A, <2 x double> %B) { +;CHECK: add {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d + %tmp3 = fadd <2 x double> %A, %B; + ret <2 x double> %tmp3 +} + +define <8 x i8> @sub8xi8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: sub {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp3 = sub <8 x i8> %A, %B; + ret <8 x i8> %tmp3 +} + +define <16 x i8> @sub16xi8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: sub {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp3 = sub <16 x i8> %A, %B; + ret <16 x i8> %tmp3 +} + +define <4 x i16> @sub4xi16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: sub {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h + %tmp3 = sub <4 x i16> %A, %B; + ret <4 x i16> %tmp3 +} + +define <8 x i16> @sub8xi16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: sub {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h + %tmp3 = sub <8 x i16> %A, %B; + ret <8 x i16> %tmp3 +} + +define <2 x i32> @sub2xi32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: sub {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %tmp3 = sub <2 x i32> %A, %B; + ret <2 x i32> %tmp3 +} + +define <4 x i32> @sub4x32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: sub {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %tmp3 = sub <4 x i32> %A, %B; + ret <4 x i32> %tmp3 +} + +define <2 x i64> @sub2xi64(<2 x i64> %A, <2 x i64> %B) { +;CHECK: sub {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d + %tmp3 = sub <2 x i64> %A, %B; + ret <2 x i64> %tmp3 +} + +define <2 x float> @sub2xfloat(<2 x float> %A, <2 x float> %B) { +;CHECK: fsub {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, 
{{v[0-31]+}}.2s + %tmp3 = fsub <2 x float> %A, %B; + ret <2 x float> %tmp3 +} + +define <4 x float> @sub4xfloat(<4 x float> %A, <4 x float> %B) { +;CHECK: fsub {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %tmp3 = fsub <4 x float> %A, %B; + ret <4 x float> %tmp3 +} +define <2 x double> @sub2xdouble(<2 x double> %A, <2 x double> %B) { +;CHECK: sub {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d + %tmp3 = fsub <2 x double> %A, %B; + ret <2 x double> %tmp3 +} + +define <1 x i64> @add1xi64(<1 x i64> %A, <1 x i64> %B) { +;CHECK: add {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} + %tmp3 = add <1 x i64> %A, %B; + ret <1 x i64> %tmp3 +} + +define <1 x i64> @sub1xi64(<1 x i64> %A, <1 x i64> %B) { +;CHECK: sub {{d[0-31]+}}, {{d[0-31]+}}, {{d[0-31]+}} + %tmp3 = sub <1 x i64> %A, %B; + ret <1 x i64> %tmp3 +} + diff --git a/test/CodeGen/AArch64/neon-bitcast.ll b/test/CodeGen/AArch64/neon-bitcast.ll new file mode 100644 index 0000000000..f9ec704840 --- /dev/null +++ b/test/CodeGen/AArch64/neon-bitcast.ll @@ -0,0 +1,574 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon -verify-machineinstrs < %s | FileCheck %s + +; From <8 x i8> + +define <1 x i64> @test_v8i8_to_v1i64(<8 x i8> %in) nounwind { +; CHECK: test_v8i8_to_v1i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i8> %in to <1 x i64> + ret <1 x i64> %val +} + +define <2 x i32> @test_v8i8_to_v2i32(<8 x i8> %in) nounwind { +; CHECK: test_v8i8_to_v2i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i8> %in to <2 x i32> + ret <2 x i32> %val +} + +define <2 x float> @test_v8i8_to_v1f32(<8 x i8> %in) nounwind{ +; CHECK: test_v8i8_to_v1f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i8> %in to <2 x float> + ret <2 x float> %val +} + +define <4 x i16> @test_v8i8_to_v4i16(<8 x i8> %in) nounwind{ +; CHECK: test_v8i8_to_v4i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i8> %in to <4 x i16> + ret <4 x i16> %val +} + +define <8 x i8> @test_v8i8_to_v8i8(<8 x i8> %in) nounwind{ +; CHECK: test_v8i8_to_v8i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i8> %in to <8 x i8> + ret <8 x i8> %val +} + +; From <4 x i16> + +define <1 x i64> @test_v4i16_to_v1i64(<4 x i16> %in) nounwind { +; CHECK: test_v4i16_to_v1i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i16> %in to <1 x i64> + ret <1 x i64> %val +} + +define <2 x i32> @test_v4i16_to_v2i32(<4 x i16> %in) nounwind { +; CHECK: test_v4i16_to_v2i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i16> %in to <2 x i32> + ret <2 x i32> %val +} + +define <2 x float> @test_v4i16_to_v1f32(<4 x i16> %in) nounwind{ +; CHECK: test_v4i16_to_v1f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i16> %in to <2 x float> + ret <2 x float> %val +} + +define <4 x i16> @test_v4i16_to_v4i16(<4 x i16> %in) nounwind{ +; CHECK: test_v4i16_to_v4i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i16> %in to <4 x i16> + ret <4 x i16> %val +} + +define <8 x i8> @test_v4i16_to_v8i8(<4 x i16> %in) nounwind{ +; CHECK: test_v4i16_to_v8i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i16> %in to <8 x i8> + ret <8 x i8> %val +} + +; From <2 x i32> + +define <1 x i64> @test_v2i32_to_v1i64(<2 x i32> %in) nounwind { +; CHECK: test_v2i32_to_v1i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i32> %in to <1 x i64> + ret <1 x i64> %val +} + +define <2 x i32> @test_v2i32_to_v2i32(<2 x i32> %in) nounwind { +; 
CHECK: test_v2i32_to_v2i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i32> %in to <2 x i32> + ret <2 x i32> %val +} + +define <2 x float> @test_v2i32_to_v1f32(<2 x i32> %in) nounwind{ +; CHECK: test_v2i32_to_v1f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i32> %in to <2 x float> + ret <2 x float> %val +} + +define <4 x i16> @test_v2i32_to_v4i16(<2 x i32> %in) nounwind{ +; CHECK: test_v2i32_to_v4i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i32> %in to <4 x i16> + ret <4 x i16> %val +} + +define <8 x i8> @test_v2i32_to_v8i8(<2 x i32> %in) nounwind{ +; CHECK: test_v2i32_to_v8i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i32> %in to <8 x i8> + ret <8 x i8> %val +} + +; From <2 x float> + +define <1 x i64> @test_v2f32_to_v1i64(<2 x float> %in) nounwind { +; CHECK: test_v2f32_to_v1i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x float> %in to <1 x i64> + ret <1 x i64> %val +} + +define <2 x i32> @test_v2f32_to_v2i32(<2 x float> %in) nounwind { +; CHECK: test_v2f32_to_v2i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x float> %in to <2 x i32> + ret <2 x i32> %val +} + +define <2 x float> @test_v2f32_to_v2f32(<2 x float> %in) nounwind{ +; CHECK: test_v2f32_to_v2f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x float> %in to <2 x float> + ret <2 x float> %val +} + +define <4 x i16> @test_v2f32_to_v4i16(<2 x float> %in) nounwind{ +; CHECK: test_v2f32_to_v4i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x float> %in to <4 x i16> + ret <4 x i16> %val +} + +define <8 x i8> @test_v2f32_to_v8i8(<2 x float> %in) nounwind{ +; CHECK: test_v2f32_to_v8i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x float> %in to <8 x i8> + ret <8 x i8> %val +} + +; From <1 x i64> + +define <1 x i64> @test_v1i64_to_v1i64(<1 x i64> %in) nounwind { +; CHECK: test_v1i64_to_v1i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <1 x i64> %in to <1 x i64> + ret <1 x i64> %val +} + +define <2 x i32> @test_v1i64_to_v2i32(<1 x i64> %in) nounwind { +; CHECK: test_v1i64_to_v2i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <1 x i64> %in to <2 x i32> + ret <2 x i32> %val +} + +define <2 x float> @test_v1i64_to_v2f32(<1 x i64> %in) nounwind{ +; CHECK: test_v1i64_to_v2f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <1 x i64> %in to <2 x float> + ret <2 x float> %val +} + +define <4 x i16> @test_v1i64_to_v4i16(<1 x i64> %in) nounwind{ +; CHECK: test_v1i64_to_v4i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <1 x i64> %in to <4 x i16> + ret <4 x i16> %val +} + +define <8 x i8> @test_v1i64_to_v8i8(<1 x i64> %in) nounwind{ +; CHECK: test_v1i64_to_v8i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <1 x i64> %in to <8 x i8> + ret <8 x i8> %val +} + + +; From <16 x i8> + +define <2 x double> @test_v16i8_to_v2f64(<16 x i8> %in) nounwind { +; CHECK: test_v16i8_to_v2f64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <16 x i8> %in to <2 x double> + ret <2 x double> %val +} + +define <2 x i64> @test_v16i8_to_v2i64(<16 x i8> %in) nounwind { +; CHECK: test_v16i8_to_v2i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <16 x i8> %in to <2 x i64> + ret <2 x i64> %val +} + +define <4 x i32> @test_v16i8_to_v4i32(<16 x i8> %in) nounwind { +; CHECK: test_v16i8_to_v4i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast 
<16 x i8> %in to <4 x i32> + ret <4 x i32> %val +} + +define <4 x float> @test_v16i8_to_v2f32(<16 x i8> %in) nounwind{ +; CHECK: test_v16i8_to_v2f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <16 x i8> %in to <4 x float> + ret <4 x float> %val +} + +define <8 x i16> @test_v16i8_to_v8i16(<16 x i8> %in) nounwind{ +; CHECK: test_v16i8_to_v8i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <16 x i8> %in to <8 x i16> + ret <8 x i16> %val +} + +define <16 x i8> @test_v16i8_to_v16i8(<16 x i8> %in) nounwind{ +; CHECK: test_v16i8_to_v16i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <16 x i8> %in to <16 x i8> + ret <16 x i8> %val +} + +; From <8 x i16> + +define <2 x double> @test_v8i16_to_v2f64(<8 x i16> %in) nounwind { +; CHECK: test_v8i16_to_v2f64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i16> %in to <2 x double> + ret <2 x double> %val +} + +define <2 x i64> @test_v8i16_to_v2i64(<8 x i16> %in) nounwind { +; CHECK: test_v8i16_to_v2i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i16> %in to <2 x i64> + ret <2 x i64> %val +} + +define <4 x i32> @test_v8i16_to_v4i32(<8 x i16> %in) nounwind { +; CHECK: test_v8i16_to_v4i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i16> %in to <4 x i32> + ret <4 x i32> %val +} + +define <4 x float> @test_v8i16_to_v2f32(<8 x i16> %in) nounwind{ +; CHECK: test_v8i16_to_v2f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i16> %in to <4 x float> + ret <4 x float> %val +} + +define <8 x i16> @test_v8i16_to_v8i16(<8 x i16> %in) nounwind{ +; CHECK: test_v8i16_to_v8i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i16> %in to <8 x i16> + ret <8 x i16> %val +} + +define <16 x i8> @test_v8i16_to_v16i8(<8 x i16> %in) nounwind{ +; CHECK: test_v8i16_to_v16i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <8 x i16> %in to <16 x i8> + ret <16 x i8> %val +} + +; From <4 x i32> + +define <2 x double> @test_v4i32_to_v2f64(<4 x i32> %in) nounwind { +; CHECK: test_v4i32_to_v2f64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i32> %in to <2 x double> + ret <2 x double> %val +} + +define <2 x i64> @test_v4i32_to_v2i64(<4 x i32> %in) nounwind { +; CHECK: test_v4i32_to_v2i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i32> %in to <2 x i64> + ret <2 x i64> %val +} + +define <4 x i32> @test_v4i32_to_v4i32(<4 x i32> %in) nounwind { +; CHECK: test_v4i32_to_v4i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i32> %in to <4 x i32> + ret <4 x i32> %val +} + +define <4 x float> @test_v4i32_to_v2f32(<4 x i32> %in) nounwind{ +; CHECK: test_v4i32_to_v2f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i32> %in to <4 x float> + ret <4 x float> %val +} + +define <8 x i16> @test_v4i32_to_v8i16(<4 x i32> %in) nounwind{ +; CHECK: test_v4i32_to_v8i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i32> %in to <8 x i16> + ret <8 x i16> %val +} + +define <16 x i8> @test_v4i32_to_v16i8(<4 x i32> %in) nounwind{ +; CHECK: test_v4i32_to_v16i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x i32> %in to <16 x i8> + ret <16 x i8> %val +} + +; From <4 x float> + +define <2 x double> @test_v4f32_to_v2f64(<4 x float> %in) nounwind { +; CHECK: test_v4f32_to_v2f64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x float> %in to <2 x double> + ret <2 x double> %val +} + +define <2 x i64> 
@test_v4f32_to_v2i64(<4 x float> %in) nounwind { +; CHECK: test_v4f32_to_v2i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x float> %in to <2 x i64> + ret <2 x i64> %val +} + +define <4 x i32> @test_v4f32_to_v4i32(<4 x float> %in) nounwind { +; CHECK: test_v4f32_to_v4i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x float> %in to <4 x i32> + ret <4 x i32> %val +} + +define <4 x float> @test_v4f32_to_v4f32(<4 x float> %in) nounwind{ +; CHECK: test_v4f32_to_v4f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x float> %in to <4 x float> + ret <4 x float> %val +} + +define <8 x i16> @test_v4f32_to_v8i16(<4 x float> %in) nounwind{ +; CHECK: test_v4f32_to_v8i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x float> %in to <8 x i16> + ret <8 x i16> %val +} + +define <16 x i8> @test_v4f32_to_v16i8(<4 x float> %in) nounwind{ +; CHECK: test_v4f32_to_v16i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <4 x float> %in to <16 x i8> + ret <16 x i8> %val +} + +; From <2 x i64> + +define <2 x double> @test_v2i64_to_v2f64(<2 x i64> %in) nounwind { +; CHECK: test_v2i64_to_v2f64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i64> %in to <2 x double> + ret <2 x double> %val +} + +define <2 x i64> @test_v2i64_to_v2i64(<2 x i64> %in) nounwind { +; CHECK: test_v2i64_to_v2i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i64> %in to <2 x i64> + ret <2 x i64> %val +} + +define <4 x i32> @test_v2i64_to_v4i32(<2 x i64> %in) nounwind { +; CHECK: test_v2i64_to_v4i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i64> %in to <4 x i32> + ret <4 x i32> %val +} + +define <4 x float> @test_v2i64_to_v4f32(<2 x i64> %in) nounwind{ +; CHECK: test_v2i64_to_v4f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i64> %in to <4 x float> + ret <4 x float> %val +} + +define <8 x i16> @test_v2i64_to_v8i16(<2 x i64> %in) nounwind{ +; CHECK: test_v2i64_to_v8i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i64> %in to <8 x i16> + ret <8 x i16> %val +} + +define <16 x i8> @test_v2i64_to_v16i8(<2 x i64> %in) nounwind{ +; CHECK: test_v2i64_to_v16i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x i64> %in to <16 x i8> + ret <16 x i8> %val +} + +; From <2 x double> + +define <2 x double> @test_v2f64_to_v2f64(<2 x double> %in) nounwind { +; CHECK: test_v2f64_to_v2f64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x double> %in to <2 x double> + ret <2 x double> %val +} + +define <2 x i64> @test_v2f64_to_v2i64(<2 x double> %in) nounwind { +; CHECK: test_v2f64_to_v2i64: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x double> %in to <2 x i64> + ret <2 x i64> %val +} + +define <4 x i32> @test_v2f64_to_v4i32(<2 x double> %in) nounwind { +; CHECK: test_v2f64_to_v4i32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x double> %in to <4 x i32> + ret <4 x i32> %val +} + +define <4 x float> @test_v2f64_to_v4f32(<2 x double> %in) nounwind{ +; CHECK: test_v2f64_to_v4f32: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x double> %in to <4 x float> + ret <4 x float> %val +} + +define <8 x i16> @test_v2f64_to_v8i16(<2 x double> %in) nounwind{ +; CHECK: test_v2f64_to_v8i16: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x double> %in to <8 x i16> + ret <8 x i16> %val +} + +define <16 x i8> @test_v2f64_to_v16i8(<2 x double> %in) nounwind{ +; 
CHECK: test_v2f64_to_v16i8: +; CHECK-NEXT: // BB#0: +; CHECK-NEXT: ret + + %val = bitcast <2 x double> %in to <16 x i8> + ret <16 x i8> %val +} + diff --git a/test/CodeGen/AArch64/neon-bitwise-instructions.ll b/test/CodeGen/AArch64/neon-bitwise-instructions.ll new file mode 100644 index 0000000000..1c43b979fc --- /dev/null +++ b/test/CodeGen/AArch64/neon-bitwise-instructions.ll @@ -0,0 +1,594 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + + +define <8 x i8> @and8xi8(<8 x i8> %a, <8 x i8> %b) { +;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = and <8 x i8> %a, %b; + ret <8 x i8> %tmp1 +} + +define <16 x i8> @and16xi8(<16 x i8> %a, <16 x i8> %b) { +;CHECK: and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = and <16 x i8> %a, %b; + ret <16 x i8> %tmp1 +} + + +define <8 x i8> @orr8xi8(<8 x i8> %a, <8 x i8> %b) { +;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = or <8 x i8> %a, %b; + ret <8 x i8> %tmp1 +} + +define <16 x i8> @orr16xi8(<16 x i8> %a, <16 x i8> %b) { +;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = or <16 x i8> %a, %b; + ret <16 x i8> %tmp1 +} + + +define <8 x i8> @xor8xi8(<8 x i8> %a, <8 x i8> %b) { +;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <8 x i8> %a, %b; + ret <8 x i8> %tmp1 +} + +define <16 x i8> @xor16xi8(<16 x i8> %a, <16 x i8> %b) { +;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <16 x i8> %a, %b; + ret <16 x i8> %tmp1 +} + +define <8 x i8> @bsl8xi8_const(<8 x i8> %a, <8 x i8> %b) { +;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = and <8 x i8> %a, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp2 = and <8 x i8> %b, < i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0 > + %tmp3 = or <8 x i8> %tmp1, %tmp2 + ret <8 x i8> %tmp3 +} + +define <16 x i8> @bsl16xi8_const(<16 x i8> %a, <16 x i8> %b) { +;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = and <16 x i8> %a, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp2 = and <16 x i8> %b, < i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0 > + %tmp3 = or <16 x i8> %tmp1, %tmp2 + ret <16 x i8> %tmp3 +} + +define <8 x i8> @orn8xi8(<8 x i8> %a, <8 x i8> %b) { +;CHECK: orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <8 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp2 = or <8 x i8> %a, %tmp1 + ret <8 x i8> %tmp2 +} + +define <16 x i8> @orn16xi8(<16 x i8> %a, <16 x i8> %b) { +;CHECK: orn {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <16 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp2 = or <16 x i8> %a, %tmp1 + ret <16 x i8> %tmp2 +} + +define <8 x i8> @bic8xi8(<8 x i8> %a, <8 x i8> %b) { +;CHECK: bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <8 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp2 = and <8 x i8> %a, %tmp1 + ret <8 x i8> %tmp2 +} + +define <16 x i8> @bic16xi8(<16 x i8> %a, <16 x i8> %b) { +;CHECK: bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <16 x i8> %b, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp2 = and <16 x i8> %a, %tmp1 + ret <16 x i8> 
%tmp2 +} + +define <2 x i32> @orrimm2s_lsl0(<2 x i32> %a) { +;CHECK: orr {{v[0-31]+}}.2s, #0xff + %tmp1 = or <2 x i32> %a, < i32 255, i32 255> + ret <2 x i32> %tmp1 +} + +define <2 x i32> @orrimm2s_lsl8(<2 x i32> %a) { +;CHECK: orr {{v[0-31]+}}.2s, #0xff, lsl #8 + %tmp1 = or <2 x i32> %a, < i32 65280, i32 65280> + ret <2 x i32> %tmp1 +} + +define <2 x i32> @orrimm2s_lsl16(<2 x i32> %a) { +;CHECK: orr {{v[0-31]+}}.2s, #0xff, lsl #16 + %tmp1 = or <2 x i32> %a, < i32 16711680, i32 16711680> + ret <2 x i32> %tmp1 +} + +define <2 x i32> @orrimm2s_lsl24(<2 x i32> %a) { +;CHECK: orr {{v[0-31]+}}.2s, #0xff, lsl #24 + %tmp1 = or <2 x i32> %a, < i32 4278190080, i32 4278190080> + ret <2 x i32> %tmp1 +} + +define <4 x i32> @orrimm4s_lsl0(<4 x i32> %a) { +;CHECK: orr {{v[0-31]+}}.4s, #0xff + %tmp1 = or <4 x i32> %a, < i32 255, i32 255, i32 255, i32 255> + ret <4 x i32> %tmp1 +} + +define <4 x i32> @orrimm4s_lsl8(<4 x i32> %a) { +;CHECK: orr {{v[0-31]+}}.4s, #0xff, lsl #8 + %tmp1 = or <4 x i32> %a, < i32 65280, i32 65280, i32 65280, i32 65280> + ret <4 x i32> %tmp1 +} + +define <4 x i32> @orrimm4s_lsl16(<4 x i32> %a) { +;CHECK: orr {{v[0-31]+}}.4s, #0xff, lsl #16 + %tmp1 = or <4 x i32> %a, < i32 16711680, i32 16711680, i32 16711680, i32 16711680> + ret <4 x i32> %tmp1 +} + +define <4 x i32> @orrimm4s_lsl24(<4 x i32> %a) { +;CHECK: orr {{v[0-31]+}}.4s, #0xff, lsl #24 + %tmp1 = or <4 x i32> %a, < i32 4278190080, i32 4278190080, i32 4278190080, i32 4278190080> + ret <4 x i32> %tmp1 +} + +define <4 x i16> @orrimm4h_lsl0(<4 x i16> %a) { +;CHECK: orr {{v[0-31]+}}.4h, #0xff + %tmp1 = or <4 x i16> %a, < i16 255, i16 255, i16 255, i16 255 > + ret <4 x i16> %tmp1 +} + +define <4 x i16> @orrimm4h_lsl8(<4 x i16> %a) { +;CHECK: orr {{v[0-31]+}}.4h, #0xff, lsl #8 + %tmp1 = or <4 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280 > + ret <4 x i16> %tmp1 +} + +define <8 x i16> @orrimm8h_lsl0(<8 x i16> %a) { +;CHECK: orr {{v[0-31]+}}.8h, #0xff + %tmp1 = or <8 x i16> %a, < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255 > + ret <8 x i16> %tmp1 +} + +define <8 x i16> @orrimm8h_lsl8(<8 x i16> %a) { +;CHECK: orr {{v[0-31]+}}.8h, #0xff, lsl #8 + %tmp1 = or <8 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 > + ret <8 x i16> %tmp1 +} + +define <2 x i32> @bicimm2s_lsl0(<2 x i32> %a) { +;CHECK: bic {{v[0-31]+}}.2s, #0x10 + %tmp1 = and <2 x i32> %a, < i32 4294967279, i32 4294967279 > + ret <2 x i32> %tmp1 +} + +define <2 x i32> @bicimm2s_lsl8(<2 x i32> %a) { +;CHECK: bic {{v[0-31]+}}.2s, #0x10, lsl #8 + %tmp1 = and <2 x i32> %a, < i32 18446744073709547519, i32 18446744073709547519 > + ret <2 x i32> %tmp1 +} + +define <2 x i32> @bicimm2s_lsl16(<2 x i32> %a) { +;CHECK: bic {{v[0-31]+}}.2s, #0x10, lsl #16 + %tmp1 = and <2 x i32> %a, < i32 18446744073708503039, i32 18446744073708503039 > + ret <2 x i32> %tmp1 +} + +define <2 x i32> @bicimm2s_lsl124(<2 x i32> %a) { +;CHECK: bic {{v[0-31]+}}.2s, #0x10, lsl #24 + %tmp1 = and <2 x i32> %a, < i32 18446744073441116159, i32 18446744073441116159> + ret <2 x i32> %tmp1 +} + +define <4 x i32> @bicimm4s_lsl0(<4 x i32> %a) { +;CHECK: bic {{v[0-31]+}}.4s, #0x10 + %tmp1 = and <4 x i32> %a, < i32 4294967279, i32 4294967279, i32 4294967279, i32 4294967279 > + ret <4 x i32> %tmp1 +} + +define <4 x i32> @bicimm4s_lsl8(<4 x i32> %a) { +;CHECK: bic {{v[0-31]+}}.4s, #0x10, lsl #8 + %tmp1 = and <4 x i32> %a, < i32 18446744073709547519, i32 18446744073709547519, i32 18446744073709547519, i32 18446744073709547519 > + ret <4 x 
i32> %tmp1 +} + +define <4 x i32> @bicimm4s_lsl16(<4 x i32> %a) { +;CHECK: bic {{v[0-31]+}}.4s, #0x10, lsl #16 + %tmp1 = and <4 x i32> %a, < i32 18446744073708503039, i32 18446744073708503039, i32 18446744073708503039, i32 18446744073708503039 > + ret <4 x i32> %tmp1 +} + +define <4 x i32> @bicimm4s_lsl124(<4 x i32> %a) { +;CHECK: bic {{v[0-31]+}}.4s, #0x10, lsl #24 + %tmp1 = and <4 x i32> %a, < i32 18446744073441116159, i32 18446744073441116159, i32 18446744073441116159, i32 18446744073441116159> + ret <4 x i32> %tmp1 +} + +define <4 x i16> @bicimm4h_lsl0_a(<4 x i16> %a) { +;CHECK: bic {{v[0-31]+}}.4h, #0x10 + %tmp1 = and <4 x i16> %a, < i16 18446744073709551599, i16 18446744073709551599, i16 18446744073709551599, i16 18446744073709551599 > + ret <4 x i16> %tmp1 +} + +define <4 x i16> @bicimm4h_lsl0_b(<4 x i16> %a) { +;CHECK: bic {{v[0-31]+}}.4h, #0x0 + %tmp1 = and <4 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280 > + ret <4 x i16> %tmp1 +} + +define <4 x i16> @bicimm4h_lsl8_a(<4 x i16> %a) { +;CHECK: bic {{v[0-31]+}}.4h, #0x10, lsl #8 + %tmp1 = and <4 x i16> %a, < i16 18446744073709547519, i16 18446744073709547519, i16 18446744073709547519, i16 18446744073709547519> + ret <4 x i16> %tmp1 +} + +define <4 x i16> @bicimm4h_lsl8_b(<4 x i16> %a) { +;CHECK: bic {{v[0-31]+}}.4h, #0x0, lsl #8 + %tmp1 = and <4 x i16> %a, < i16 255, i16 255, i16 255, i16 255> + ret <4 x i16> %tmp1 +} + +define <8 x i16> @bicimm8h_lsl0_a(<8 x i16> %a) { +;CHECK: bic {{v[0-31]+}}.8h, #0x10 + %tmp1 = and <8 x i16> %a, < i16 18446744073709551599, i16 18446744073709551599, i16 18446744073709551599, i16 18446744073709551599, + i16 18446744073709551599, i16 18446744073709551599, i16 18446744073709551599, i16 18446744073709551599 > + ret <8 x i16> %tmp1 +} + +define <8 x i16> @bicimm8h_lsl0_b(<8 x i16> %a) { +;CHECK: bic {{v[0-31]+}}.8h, #0x0 + %tmp1 = and <8 x i16> %a, < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 > + ret <8 x i16> %tmp1 +} + +define <8 x i16> @bicimm8h_lsl8_a(<8 x i16> %a) { +;CHECK: bic {{v[0-31]+}}.8h, #0x10, lsl #8 + %tmp1 = and <8 x i16> %a, < i16 18446744073709547519, i16 18446744073709547519, i16 18446744073709547519, i16 18446744073709547519, + i16 18446744073709547519, i16 18446744073709547519, i16 18446744073709547519, i16 18446744073709547519> + ret <8 x i16> %tmp1 +} + +define <8 x i16> @bicimm8h_lsl8_b(<8 x i16> %a) { +;CHECK: bic {{v[0-31]+}}.8h, #0x0, lsl #8 + %tmp1 = and <8 x i16> %a, < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255> + ret <8 x i16> %tmp1 +} + +define <2 x i32> @and2xi32(<2 x i32> %a, <2 x i32> %b) { +;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = and <2 x i32> %a, %b; + ret <2 x i32> %tmp1 +} + +define <4 x i16> @and4xi16(<4 x i16> %a, <4 x i16> %b) { +;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = and <4 x i16> %a, %b; + ret <4 x i16> %tmp1 +} + +define <1 x i64> @and1xi64(<1 x i64> %a, <1 x i64> %b) { +;CHECK: and {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = and <1 x i64> %a, %b; + ret <1 x i64> %tmp1 +} + +define <4 x i32> @and4xi32(<4 x i32> %a, <4 x i32> %b) { +;CHECK: and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = and <4 x i32> %a, %b; + ret <4 x i32> %tmp1 +} + +define <8 x i16> @and8xi16(<8 x i16> %a, <8 x i16> %b) { +;CHECK: and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = and <8 x i16> %a, %b; + ret <8 x i16> %tmp1 +} + +define <2 x i64> @and2xi64(<2 x i64> %a, <2 x i64> %b) { +;CHECK: 
and {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = and <2 x i64> %a, %b; + ret <2 x i64> %tmp1 +} + +define <2 x i32> @orr2xi32(<2 x i32> %a, <2 x i32> %b) { +;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = or <2 x i32> %a, %b; + ret <2 x i32> %tmp1 +} + +define <4 x i16> @orr4xi16(<4 x i16> %a, <4 x i16> %b) { +;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = or <4 x i16> %a, %b; + ret <4 x i16> %tmp1 +} + +define <1 x i64> @orr1xi64(<1 x i64> %a, <1 x i64> %b) { +;CHECK: orr {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = or <1 x i64> %a, %b; + ret <1 x i64> %tmp1 +} + +define <4 x i32> @orr4xi32(<4 x i32> %a, <4 x i32> %b) { +;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = or <4 x i32> %a, %b; + ret <4 x i32> %tmp1 +} + +define <8 x i16> @orr8xi16(<8 x i16> %a, <8 x i16> %b) { +;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = or <8 x i16> %a, %b; + ret <8 x i16> %tmp1 +} + +define <2 x i64> @orr2xi64(<2 x i64> %a, <2 x i64> %b) { +;CHECK: orr {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = or <2 x i64> %a, %b; + ret <2 x i64> %tmp1 +} + +define <2 x i32> @eor2xi32(<2 x i32> %a, <2 x i32> %b) { +;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <2 x i32> %a, %b; + ret <2 x i32> %tmp1 +} + +define <4 x i16> @eor4xi16(<4 x i16> %a, <4 x i16> %b) { +;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <4 x i16> %a, %b; + ret <4 x i16> %tmp1 +} + +define <1 x i64> @eor1xi64(<1 x i64> %a, <1 x i64> %b) { +;CHECK: eor {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <1 x i64> %a, %b; + ret <1 x i64> %tmp1 +} + +define <4 x i32> @eor4xi32(<4 x i32> %a, <4 x i32> %b) { +;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <4 x i32> %a, %b; + ret <4 x i32> %tmp1 +} + +define <8 x i16> @eor8xi16(<8 x i16> %a, <8 x i16> %b) { +;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <8 x i16> %a, %b; + ret <8 x i16> %tmp1 +} + +define <2 x i64> @eor2xi64(<2 x i64> %a, <2 x i64> %b) { +;CHECK: eor {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <2 x i64> %a, %b; + ret <2 x i64> %tmp1 +} + + +define <2 x i32> @bic2xi32(<2 x i32> %a, <2 x i32> %b) { +;CHECK: bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <2 x i32> %b, < i32 -1, i32 -1 > + %tmp2 = and <2 x i32> %a, %tmp1 + ret <2 x i32> %tmp2 +} + +define <4 x i16> @bic4xi16(<4 x i16> %a, <4 x i16> %b) { +;CHECK: bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <4 x i16> %b, < i16 -1, i16 -1, i16 -1, i16-1 > + %tmp2 = and <4 x i16> %a, %tmp1 + ret <4 x i16> %tmp2 +} + +define <1 x i64> @bic1xi64(<1 x i64> %a, <1 x i64> %b) { +;CHECK: bic {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <1 x i64> %b, < i64 -1> + %tmp2 = and <1 x i64> %a, %tmp1 + ret <1 x i64> %tmp2 +} + +define <4 x i32> @bic4xi32(<4 x i32> %a, <4 x i32> %b) { +;CHECK: bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <4 x i32> %b, < i32 -1, i32 -1, i32 -1, i32 -1> + %tmp2 = and <4 x i32> %a, %tmp1 + ret <4 x i32> %tmp2 +} + +define <8 x i16> @bic8xi16(<8 x i16> %a, <8 x i16> %b) { +;CHECK: bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <8 x i16> %b, < i16 -1, i16 -1, i16 -1, i16-1, i16 -1, i16 -1, i16 -1, i16 -1 > + %tmp2 = and <8 x i16> %a, %tmp1 + ret <8 x i16> %tmp2 +} + +define <2 x i64> @bic2xi64(<2 x i64> %a, <2 
x i64> %b) { +;CHECK: bic {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <2 x i64> %b, < i64 -1, i64 -1> + %tmp2 = and <2 x i64> %a, %tmp1 + ret <2 x i64> %tmp2 +} + +define <2 x i32> @orn2xi32(<2 x i32> %a, <2 x i32> %b) { +;CHECK: orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <2 x i32> %b, < i32 -1, i32 -1 > + %tmp2 = or <2 x i32> %a, %tmp1 + ret <2 x i32> %tmp2 +} + +define <4 x i16> @orn4xi16(<4 x i16> %a, <4 x i16> %b) { +;CHECK: orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <4 x i16> %b, < i16 -1, i16 -1, i16 -1, i16-1 > + %tmp2 = or <4 x i16> %a, %tmp1 + ret <4 x i16> %tmp2 +} + +define <1 x i64> @orn1xi64(<1 x i64> %a, <1 x i64> %b) { +;CHECK: orn {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = xor <1 x i64> %b, < i64 -1> + %tmp2 = or <1 x i64> %a, %tmp1 + ret <1 x i64> %tmp2 +} + +define <4 x i32> @orn4xi32(<4 x i32> %a, <4 x i32> %b) { +;CHECK: orn {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <4 x i32> %b, < i32 -1, i32 -1, i32 -1, i32 -1> + %tmp2 = or <4 x i32> %a, %tmp1 + ret <4 x i32> %tmp2 +} + +define <8 x i16> @orn8xi16(<8 x i16> %a, <8 x i16> %b) { +;CHECK: orn {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <8 x i16> %b, < i16 -1, i16 -1, i16 -1, i16-1, i16 -1, i16 -1, i16 -1, i16 -1 > + %tmp2 = or <8 x i16> %a, %tmp1 + ret <8 x i16> %tmp2 +} + +define <2 x i64> @orn2xi64(<2 x i64> %a, <2 x i64> %b) { +;CHECK: orn {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = xor <2 x i64> %b, < i64 -1, i64 -1> + %tmp2 = or <2 x i64> %a, %tmp1 + ret <2 x i64> %tmp2 +} +define <2 x i32> @bsl2xi32_const(<2 x i32> %a, <2 x i32> %b) { +;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = and <2 x i32> %a, < i32 -1, i32 -1 > + %tmp2 = and <2 x i32> %b, < i32 0, i32 0 > + %tmp3 = or <2 x i32> %tmp1, %tmp2 + ret <2 x i32> %tmp3 +} + + +define <4 x i16> @bsl4xi16_const(<4 x i16> %a, <4 x i16> %b) { +;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = and <4 x i16> %a, < i16 -1, i16 -1, i16 -1,i16 -1 > + %tmp2 = and <4 x i16> %b, < i16 0, i16 0,i16 0, i16 0 > + %tmp3 = or <4 x i16> %tmp1, %tmp2 + ret <4 x i16> %tmp3 +} + +define <1 x i64> @bsl1xi64_const(<1 x i64> %a, <1 x i64> %b) { +;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = and <1 x i64> %a, < i64 -1 > + %tmp2 = and <1 x i64> %b, < i64 0 > + %tmp3 = or <1 x i64> %tmp1, %tmp2 + ret <1 x i64> %tmp3 +} + +define <4 x i32> @bsl4xi32_const(<4 x i32> %a, <4 x i32> %b) { +;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = and <4 x i32> %a, < i32 -1, i32 -1, i32 -1, i32 -1 > + %tmp2 = and <4 x i32> %b, < i32 0, i32 0, i32 0, i32 0 > + %tmp3 = or <4 x i32> %tmp1, %tmp2 + ret <4 x i32> %tmp3 +} + +define <8 x i16> @bsl8xi16_const(<8 x i16> %a, <8 x i16> %b) { +;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = and <8 x i16> %a, < i16 -1, i16 -1, i16 -1,i16 -1, i16 -1, i16 -1, i16 -1,i16 -1 > + %tmp2 = and <8 x i16> %b, < i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0 > + %tmp3 = or <8 x i16> %tmp1, %tmp2 + ret <8 x i16> %tmp3 +} + +define <2 x i64> @bsl2xi64_const(<2 x i64> %a, <2 x i64> %b) { +;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = and <2 x i64> %a, < i64 -1, i64 -1 > + %tmp2 = and <2 x i64> %b, < i64 0, i64 0 > + %tmp3 = or <2 x i64> %tmp1, %tmp2 + ret <2 x i64> %tmp3 +} + + +define <8 x i8> @bsl8xi8(<8 x i8> %v1, <8 x i8> %v2, <8 x i8> %v3) { +;CHECK: bsl 
{{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %1 = and <8 x i8> %v1, %v2 + %2 = xor <8 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> + %3 = and <8 x i8> %2, %v3 + %4 = or <8 x i8> %1, %3 + ret <8 x i8> %4 +} + +define <4 x i16> @bsl4xi16(<4 x i16> %v1, <4 x i16> %v2, <4 x i16> %v3) { +;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %1 = and <4 x i16> %v1, %v2 + %2 = xor <4 x i16> %v1, <i16 -1, i16 -1, i16 -1, i16 -1> + %3 = and <4 x i16> %2, %v3 + %4 = or <4 x i16> %1, %3 + ret <4 x i16> %4 +} + +define <2 x i32> @bsl2xi32(<2 x i32> %v1, <2 x i32> %v2, <2 x i32> %v3) { +;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %1 = and <2 x i32> %v1, %v2 + %2 = xor <2 x i32> %v1, <i32 -1, i32 -1> + %3 = and <2 x i32> %2, %v3 + %4 = or <2 x i32> %1, %3 + ret <2 x i32> %4 +} + +define <1 x i64> @bsl1xi64(<1 x i64> %v1, <1 x i64> %v2, <1 x i64> %v3) { +;CHECK: bsl {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %1 = and <1 x i64> %v1, %v2 + %2 = xor <1 x i64> %v1, <i64 -1> + %3 = and <1 x i64> %2, %v3 + %4 = or <1 x i64> %1, %3 + ret <1 x i64> %4 +} + +define <16 x i8> @bsl16xi8(<16 x i8> %v1, <16 x i8> %v2, <16 x i8> %v3) { +;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %1 = and <16 x i8> %v1, %v2 + %2 = xor <16 x i8> %v1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1> + %3 = and <16 x i8> %2, %v3 + %4 = or <16 x i8> %1, %3 + ret <16 x i8> %4 +} + +define <8 x i16> @bsl8xi16(<8 x i16> %v1, <8 x i16> %v2, <8 x i16> %v3) { +;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %1 = and <8 x i16> %v1, %v2 + %2 = xor <8 x i16> %v1, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> + %3 = and <8 x i16> %2, %v3 + %4 = or <8 x i16> %1, %3 + ret <8 x i16> %4 +} + +define <4 x i32> @bsl4xi32(<4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3) { +;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %1 = and <4 x i32> %v1, %v2 + %2 = xor <4 x i32> %v1, <i32 -1, i32 -1, i32 -1, i32 -1> + %3 = and <4 x i32> %2, %v3 + %4 = or <4 x i32> %1, %3 + ret <4 x i32> %4 +} + +define <2 x i64> @bsl2xi64(<2 x i64> %v1, <2 x i64> %v2, <2 x i64> %v3) { +;CHECK: bsl {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %1 = and <2 x i64> %v1, %v2 + %2 = xor <2 x i64> %v1, <i64 -1, i64 -1> + %3 = and <2 x i64> %2, %v3 + %4 = or <2 x i64> %1, %3 + ret <2 x i64> %4 +} + +define <8 x i8> @orrimm8b_as_orrimm4h_lsl0(<8 x i8> %a) { +;CHECK: orr {{v[0-31]+}}.4h, #0xff + %val = or <8 x i8> %a, <i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0> + ret <8 x i8> %val +} + +define <8 x i8> @orrimm8b_as_orimm4h_lsl8(<8 x i8> %a) { +;CHECK: orr {{v[0-31]+}}.4h, #0xff, lsl #8 + %val = or <8 x i8> %a, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255> + ret <8 x i8> %val +} + +define <16 x i8> @orimm16b_as_orrimm8h_lsl0(<16 x i8> %a) { +;CHECK: orr {{v[0-31]+}}.8h, #0xff + %val = or <16 x i8> %a, <i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0> + ret <16 x i8> %val +} + +define <16 x i8> @orimm16b_as_orrimm8h_lsl8(<16 x i8> %a) { +;CHECK: orr {{v[0-31]+}}.8h, #0xff, lsl #8 + %val = or <16 x i8> %a, <i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255, i8 0, i8 255> + ret <16 x i8> %val +} + + diff --git a/test/CodeGen/AArch64/neon-compare-instructions.ll b/test/CodeGen/AArch64/neon-compare-instructions.ll new file mode 100644 index 
0000000000..0848f9b03d --- /dev/null +++ b/test/CodeGen/AArch64/neon-compare-instructions.ll @@ -0,0 +1,1982 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +define <8 x i8> @cmeq8xi8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp eq <8 x i8> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmeq16xi8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp eq <16 x i8> %A, %B; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmeq4xi16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + %tmp3 = icmp eq <4 x i16> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmeq8xi16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + %tmp3 = icmp eq <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmeq2xi32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = icmp eq <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmeq4xi32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = icmp eq <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmeq2xi64(<2 x i64> %A, <2 x i64> %B) { +;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = icmp eq <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmne8xi8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp ne <8 x i8> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmne16xi8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp ne <16 x i8> %A, %B; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmne4xi16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp ne <4 x i16> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmne8xi16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp ne <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmne2xi32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp ne <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmne4xi32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s 
+;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp ne <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmne2xi64(<2 x i64> %A, <2 x i64> %B) { +;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp ne <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmgt8xi8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp sgt <8 x i8> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmgt16xi8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp sgt <16 x i8> %A, %B; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmgt4xi16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + %tmp3 = icmp sgt <4 x i16> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmgt8xi16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + %tmp3 = icmp sgt <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmgt2xi32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = icmp sgt <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmgt4xi32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = icmp sgt <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmgt2xi64(<2 x i64> %A, <2 x i64> %B) { +;CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = icmp sgt <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmlt8xi8(<8 x i8> %A, <8 x i8> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LT implemented as GT, so check reversed operands. +;CHECK: cmgt {{v[0-9]+}}.8b, v1.8b, v0.8b + %tmp3 = icmp slt <8 x i8> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmlt16xi8(<16 x i8> %A, <16 x i8> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LT implemented as GT, so check reversed operands. +;CHECK: cmgt {{v[0-9]+}}.16b, v1.16b, v0.16b + %tmp3 = icmp slt <16 x i8> %A, %B; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmlt4xi16(<4 x i16> %A, <4 x i16> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LT implemented as GT, so check reversed operands. +;CHECK: cmgt {{v[0-9]+}}.4h, v1.4h, v0.4h + %tmp3 = icmp slt <4 x i16> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmlt8xi16(<8 x i16> %A, <8 x i16> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LT implemented as GT, so check reversed operands. 
+;CHECK: cmgt {{v[0-9]+}}.8h, v1.8h, v0.8h + %tmp3 = icmp slt <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmlt2xi32(<2 x i32> %A, <2 x i32> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LT implemented as GT, so check reversed operands. +;CHECK: cmgt {{v[0-9]+}}.2s, v1.2s, v0.2s + %tmp3 = icmp slt <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmlt4xi32(<4 x i32> %A, <4 x i32> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LT implemented as GT, so check reversed operands. +;CHECK: cmgt {{v[0-9]+}}.4s, v1.4s, v0.4s + %tmp3 = icmp slt <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmlt2xi64(<2 x i64> %A, <2 x i64> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LT implemented as GT, so check reversed operands. +;CHECK: cmgt {{v[0-9]+}}.2d, v1.2d, v0.2d + %tmp3 = icmp slt <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmge8xi8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp sge <8 x i8> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmge16xi8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp sge <16 x i8> %A, %B; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmge4xi16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + %tmp3 = icmp sge <4 x i16> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmge8xi16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + %tmp3 = icmp sge <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmge2xi32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = icmp sge <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmge4xi32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = icmp sge <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmge2xi64(<2 x i64> %A, <2 x i64> %B) { +;CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = icmp sge <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmle8xi8(<8 x i8> %A, <8 x i8> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LE implemented as GE, so check reversed operands. +;CHECK: cmge {{v[0-9]+}}.8b, v1.8b, v0.8b + %tmp3 = icmp sle <8 x i8> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmle16xi8(<16 x i8> %A, <16 x i8> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LE implemented as GE, so check reversed operands. +;CHECK: cmge {{v[0-9]+}}.16b, v1.16b, v0.16b + %tmp3 = icmp sle <16 x i8> %A, %B; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmle4xi16(<4 x i16> %A, <4 x i16> %B) { +; Using registers other than v0, v1 are possible, but would be odd. 
+; LE implemented as GE, so check reversed operands. +;CHECK: cmge {{v[0-9]+}}.4h, v1.4h, v0.4h + %tmp3 = icmp sle <4 x i16> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmle8xi16(<8 x i16> %A, <8 x i16> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LE implemented as GE, so check reversed operands. +;CHECK: cmge {{v[0-9]+}}.8h, v1.8h, v0.8h + %tmp3 = icmp sle <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmle2xi32(<2 x i32> %A, <2 x i32> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LE implemented as GE, so check reversed operands. +;CHECK: cmge {{v[0-9]+}}.2s, v1.2s, v0.2s + %tmp3 = icmp sle <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmle4xi32(<4 x i32> %A, <4 x i32> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LE implemented as GE, so check reversed operands. +;CHECK: cmge {{v[0-9]+}}.4s, v1.4s, v0.4s + %tmp3 = icmp sle <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmle2xi64(<2 x i64> %A, <2 x i64> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LE implemented as GE, so check reversed operands. +;CHECK: cmge {{v[0-9]+}}.2d, v1.2d, v0.2d + %tmp3 = icmp sle <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmhi8xi8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp ugt <8 x i8> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmhi16xi8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp ugt <16 x i8> %A, %B; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmhi4xi16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + %tmp3 = icmp ugt <4 x i16> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmhi8xi16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + %tmp3 = icmp ugt <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmhi2xi32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = icmp ugt <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmhi4xi32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = icmp ugt <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmhi2xi64(<2 x i64> %A, <2 x i64> %B) { +;CHECK: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = icmp ugt <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmlo8xi8(<8 x i8> %A, <8 x i8> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. 
+;CHECK: cmhi {{v[0-9]+}}.8b, v1.8b, v0.8b + %tmp3 = icmp ult <8 x i8> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmlo16xi8(<16 x i8> %A, <16 x i8> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: cmhi {{v[0-9]+}}.16b, v1.16b, v0.16b + %tmp3 = icmp ult <16 x i8> %A, %B; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmlo4xi16(<4 x i16> %A, <4 x i16> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: cmhi {{v[0-9]+}}.4h, v1.4h, v0.4h + %tmp3 = icmp ult <4 x i16> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmlo8xi16(<8 x i16> %A, <8 x i16> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: cmhi {{v[0-9]+}}.8h, v1.8h, v0.8h + %tmp3 = icmp ult <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmlo2xi32(<2 x i32> %A, <2 x i32> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: cmhi {{v[0-9]+}}.2s, v1.2s, v0.2s + %tmp3 = icmp ult <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmlo4xi32(<4 x i32> %A, <4 x i32> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: cmhi {{v[0-9]+}}.4s, v1.4s, v0.4s + %tmp3 = icmp ult <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmlo2xi64(<2 x i64> %A, <2 x i64> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. 
+;CHECK: cmhi {{v[0-9]+}}.2d, v1.2d, v0.2d + %tmp3 = icmp ult <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmhs8xi8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp uge <8 x i8> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmhs16xi8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp uge <16 x i8> %A, %B; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmhs4xi16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + %tmp3 = icmp uge <4 x i16> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmhs8xi16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + %tmp3 = icmp uge <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmhs2xi32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = icmp uge <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmhs4xi32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = icmp uge <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmhs2xi64(<2 x i64> %A, <2 x i64> %B) { +;CHECK: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = icmp uge <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmls8xi8(<8 x i8> %A, <8 x i8> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b + %tmp3 = icmp ule <8 x i8> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmls16xi8(<16 x i8> %A, <16 x i8> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: cmhs {{v[0-9]+}}.16b, v1.16b, v0.16b + %tmp3 = icmp ule <16 x i8> %A, %B; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmls4xi16(<4 x i16> %A, <4 x i16> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h + %tmp3 = icmp ule <4 x i16> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmls8xi16(<8 x i16> %A, <8 x i16> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: cmhs {{v[0-9]+}}.8h, v1.8h, v0.8h + %tmp3 = icmp ule <8 x i16> %A, %B; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmls2xi32(<2 x i32> %A, <2 x i32> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s + %tmp3 = icmp ule <2 x i32> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmls4xi32(<4 x i32> %A, <4 x i32> %B) { +; Using registers other than v0, v1 are possible, but would be odd. 
+; LS implemented as HS, so check reversed operands. +;CHECK: cmhs {{v[0-9]+}}.4s, v1.4s, v0.4s + %tmp3 = icmp ule <4 x i32> %A, %B; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmls2xi64(<2 x i64> %A, <2 x i64> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: cmhs {{v[0-9]+}}.2d, v1.2d, v0.2d + %tmp3 = icmp ule <2 x i64> %A, %B; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmtst8xi8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: cmtst {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = and <8 x i8> %A, %B + %tmp4 = icmp ne <8 x i8> %tmp3, zeroinitializer + %tmp5 = sext <8 x i1> %tmp4 to <8 x i8> + ret <8 x i8> %tmp5 +} + +define <16 x i8> @cmtst16xi8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: cmtst {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = and <16 x i8> %A, %B + %tmp4 = icmp ne <16 x i8> %tmp3, zeroinitializer + %tmp5 = sext <16 x i1> %tmp4 to <16 x i8> + ret <16 x i8> %tmp5 +} + +define <4 x i16> @cmtst4xi16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: cmtst {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + %tmp3 = and <4 x i16> %A, %B + %tmp4 = icmp ne <4 x i16> %tmp3, zeroinitializer + %tmp5 = sext <4 x i1> %tmp4 to <4 x i16> + ret <4 x i16> %tmp5 +} + +define <8 x i16> @cmtst8xi16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: cmtst {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + %tmp3 = and <8 x i16> %A, %B + %tmp4 = icmp ne <8 x i16> %tmp3, zeroinitializer + %tmp5 = sext <8 x i1> %tmp4 to <8 x i16> + ret <8 x i16> %tmp5 +} + +define <2 x i32> @cmtst2xi32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: cmtst {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = and <2 x i32> %A, %B + %tmp4 = icmp ne <2 x i32> %tmp3, zeroinitializer + %tmp5 = sext <2 x i1> %tmp4 to <2 x i32> + ret <2 x i32> %tmp5 +} + +define <4 x i32> @cmtst4xi32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: cmtst {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = and <4 x i32> %A, %B + %tmp4 = icmp ne <4 x i32> %tmp3, zeroinitializer + %tmp5 = sext <4 x i1> %tmp4 to <4 x i32> + ret <4 x i32> %tmp5 +} + +define <2 x i64> @cmtst2xi64(<2 x i64> %A, <2 x i64> %B) { +;CHECK: cmtst {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = and <2 x i64> %A, %B + %tmp4 = icmp ne <2 x i64> %tmp3, zeroinitializer + %tmp5 = sext <2 x i1> %tmp4 to <2 x i64> + ret <2 x i64> %tmp5 +} + + + +define <8 x i8> @cmeqz8xi8(<8 x i8> %A) { +;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0 + %tmp3 = icmp eq <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmeqz16xi8(<16 x i8> %A) { +;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0 + %tmp3 = icmp eq <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmeqz4xi16(<4 x i16> %A) { +;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0 + %tmp3 = icmp eq <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmeqz8xi16(<8 x i16> %A) { +;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0 + %tmp3 = icmp eq <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmeqz2xi32(<2 x i32> %A) { +;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0 + %tmp3 = icmp eq <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> 
@cmeqz4xi32(<4 x i32> %A) { +;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0 + %tmp3 = icmp eq <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmeqz2xi64(<2 x i64> %A) { +;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0 + %tmp3 = icmp eq <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + + +define <8 x i8> @cmgez8xi8(<8 x i8> %A) { +;CHECK: cmge {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0 + %tmp3 = icmp sge <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmgez16xi8(<16 x i8> %A) { +;CHECK: cmge {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0 + %tmp3 = icmp sge <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmgez4xi16(<4 x i16> %A) { +;CHECK: cmge {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0 + %tmp3 = icmp sge <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmgez8xi16(<8 x i16> %A) { +;CHECK: cmge {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0 + %tmp3 = icmp sge <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmgez2xi32(<2 x i32> %A) { +;CHECK: cmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0 + %tmp3 = icmp sge <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmgez4xi32(<4 x i32> %A) { +;CHECK: cmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0 + %tmp3 = icmp sge <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmgez2xi64(<2 x i64> %A) { +;CHECK: cmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0 + %tmp3 = icmp sge <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + + +define <8 x i8> @cmgtz8xi8(<8 x i8> %A) { +;CHECK: cmgt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0 + %tmp3 = icmp sgt <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmgtz16xi8(<16 x i8> %A) { +;CHECK: cmgt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0 + %tmp3 = icmp sgt <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmgtz4xi16(<4 x i16> %A) { +;CHECK: cmgt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0 + %tmp3 = icmp sgt <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmgtz8xi16(<8 x i16> %A) { +;CHECK: cmgt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0 + %tmp3 = icmp sgt <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmgtz2xi32(<2 x i32> %A) { +;CHECK: cmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0 + %tmp3 = icmp sgt <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmgtz4xi32(<4 x i32> %A) { +;CHECK: cmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0 + %tmp3 = icmp sgt <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmgtz2xi64(<2 x i64> %A) { +;CHECK: cmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0 + %tmp3 = icmp sgt <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmlez8xi8(<8 x i8> %A) { +;CHECK: cmle {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0 + %tmp3 = 
icmp sle <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmlez16xi8(<16 x i8> %A) { +;CHECK: cmle {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0 + %tmp3 = icmp sle <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmlez4xi16(<4 x i16> %A) { +;CHECK: cmle {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0 + %tmp3 = icmp sle <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmlez8xi16(<8 x i16> %A) { +;CHECK: cmle {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0 + %tmp3 = icmp sle <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmlez2xi32(<2 x i32> %A) { +;CHECK: cmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0 + %tmp3 = icmp sle <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmlez4xi32(<4 x i32> %A) { +;CHECK: cmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0 + %tmp3 = icmp sle <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmlez2xi64(<2 x i64> %A) { +;CHECK: cmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0 + %tmp3 = icmp sle <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmltz8xi8(<8 x i8> %A) { +;CHECK: cmlt {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0 + %tmp3 = icmp slt <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmltz16xi8(<16 x i8> %A) { +;CHECK: cmlt {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0 + %tmp3 = icmp slt <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmltz4xi16(<4 x i16> %A) { +;CHECK: cmlt {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0 + %tmp3 = icmp slt <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmltz8xi16(<8 x i16> %A) { +;CHECK: cmlt {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0 + %tmp3 = icmp slt <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmltz2xi32(<2 x i32> %A) { +;CHECK: cmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0 + %tmp3 = icmp slt <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmltz4xi32(<4 x i32> %A) { +;CHECK: cmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0 + %tmp3 = icmp slt <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmltz2xi64(<2 x i64> %A) { +;CHECK: cmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0 + %tmp3 = icmp slt <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmneqz8xi8(<8 x i8> %A) { +;CHECK: cmeq {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, #0x0 +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp ne <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmneqz16xi8(<16 x i8> %A) { +;CHECK: cmeq {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #0x0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp ne <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret 
<16 x i8> %tmp4 +} + +define <4 x i16> @cmneqz4xi16(<4 x i16> %A) { +;CHECK: cmeq {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, #0x0 +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp ne <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmneqz8xi16(<8 x i16> %A) { +;CHECK: cmeq {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, #0x0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp ne <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmneqz2xi32(<2 x i32> %A) { +;CHECK: cmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0x0 +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp ne <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmneqz4xi32(<4 x i32> %A) { +;CHECK: cmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0x0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp ne <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmneqz2xi64(<2 x i64> %A) { +;CHECK: cmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0x0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp ne <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmhsz8xi8(<8 x i8> %A) { +;CHECK: movi {{v[0-9]+}}.8b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp uge <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmhsz16xi8(<16 x i8> %A) { +;CHECK: movi {{v[0-9]+}}.16b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp uge <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmhsz4xi16(<4 x i16> %A) { +;CHECK: movi {{v[0-9]+}}.8b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + %tmp3 = icmp uge <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmhsz8xi16(<8 x i16> %A) { +;CHECK: movi {{v[0-9]+}}.16b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + %tmp3 = icmp uge <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmhsz2xi32(<2 x i32> %A) { +;CHECK: movi {{v[0-9]+}}.8b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = icmp uge <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmhsz4xi32(<4 x i32> %A) { +;CHECK: movi {{v[0-9]+}}.16b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = icmp uge <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmhsz2xi64(<2 x i64> %A) { +;CHECK: movi {{v[0-9]+}}.16b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = icmp uge <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + + +define <8 x i8> @cmhiz8xi8(<8 x 
i8> %A) { +;CHECK: movi {{v[0-9]+}}.8b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = icmp ugt <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmhiz16xi8(<16 x i8> %A) { +;CHECK: movi {{v[0-9]+}}.16b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = icmp ugt <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmhiz4xi16(<4 x i16> %A) { +;CHECK: movi {{v[0-9]+}}.8b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h + %tmp3 = icmp ugt <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmhiz8xi16(<8 x i16> %A) { +;CHECK: movi {{v[0-9]+}}.16b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h + %tmp3 = icmp ugt <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmhiz2xi32(<2 x i32> %A) { +;CHECK: movi {{v[0-9]+}}.8b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = icmp ugt <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmhiz4xi32(<4 x i32> %A) { +;CHECK: movi {{v[0-9]+}}.16b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = icmp ugt <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmhiz2xi64(<2 x i64> %A) { +;CHECK: movi {{v[0-9]+}}.16b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = icmp ugt <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmlsz8xi8(<8 x i8> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: movi v1.8b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.8b, v1.8b, v0.8b + %tmp3 = icmp ule <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmlsz16xi8(<16 x i8> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: movi v1.16b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.16b, v1.16b, v0.16b + %tmp3 = icmp ule <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmlsz4xi16(<4 x i16> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: movi v1.8b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.4h, v1.4h, v0.4h + %tmp3 = icmp ule <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmlsz8xi16(<8 x i16> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: movi v1.16b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.8h, v1.8h, v0.8h + %tmp3 = icmp ule <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmlsz2xi32(<2 x i32> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. 
+;CHECK: movi v1.8b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.2s, v1.2s, v0.2s + %tmp3 = icmp ule <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmlsz4xi32(<4 x i32> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: movi v1.16b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.4s, v1.4s, v0.4s + %tmp3 = icmp ule <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmlsz2xi64(<2 x i64> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LS implemented as HS, so check reversed operands. +;CHECK: movi v1.16b, #0x0 +;CHECK-NEXT: cmhs {{v[0-9]+}}.2d, v1.2d, v0.2d + %tmp3 = icmp ule <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <8 x i8> @cmloz8xi8(<8 x i8> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: movi v1.8b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.8b, v1.8b, {{v[0-9]+}}.8b + %tmp3 = icmp ult <8 x i8> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i8> + ret <8 x i8> %tmp4 +} + +define <16 x i8> @cmloz16xi8(<16 x i8> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: movi v1.16b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.16b, v1.16b, v0.16b + %tmp3 = icmp ult <16 x i8> %A, zeroinitializer; + %tmp4 = sext <16 x i1> %tmp3 to <16 x i8> + ret <16 x i8> %tmp4 +} + +define <4 x i16> @cmloz4xi16(<4 x i16> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: movi v1.8b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.4h, v1.4h, v0.4h + %tmp3 = icmp ult <4 x i16> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i16> + ret <4 x i16> %tmp4 +} + +define <8 x i16> @cmloz8xi16(<8 x i16> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: movi v1.16b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.8h, v1.8h, v0.8h + %tmp3 = icmp ult <8 x i16> %A, zeroinitializer; + %tmp4 = sext <8 x i1> %tmp3 to <8 x i16> + ret <8 x i16> %tmp4 +} + +define <2 x i32> @cmloz2xi32(<2 x i32> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: movi v1.8b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.2s, v1.2s, v0.2s + %tmp3 = icmp ult <2 x i32> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @cmloz4xi32(<4 x i32> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. +;CHECK: movi v1.16b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.4s, v1.4s, v0.4s + %tmp3 = icmp ult <4 x i32> %A, zeroinitializer; + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @cmloz2xi64(<2 x i64> %A) { +; Using registers other than v0, v1 are possible, but would be odd. +; LO implemented as HI, so check reversed operands. 
+;CHECK: movi v1.16b, #0x0 +;CHECK-NEXT: cmhi {{v[0-9]+}}.2d, v1.2d, v0.2d + %tmp3 = icmp ult <2 x i64> %A, zeroinitializer; + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + + +define <2 x i32> @fcmoeq2xfloat(<2 x float> %A, <2 x float> %B) { +;CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = fcmp oeq <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmoeq4xfloat(<4 x float> %A, <4 x float> %B) { +;CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = fcmp oeq <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmoeq2xdouble(<2 x double> %A, <2 x double> %B) { +;CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = fcmp oeq <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmoge2xfloat(<2 x float> %A, <2 x float> %B) { +;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = fcmp oge <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmoge4xfloat(<4 x float> %A, <4 x float> %B) { +;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = fcmp oge <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmoge2xdouble(<2 x double> %A, <2 x double> %B) { +;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = fcmp oge <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmogt2xfloat(<2 x float> %A, <2 x float> %B) { +;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + %tmp3 = fcmp ogt <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmogt4xfloat(<4 x float> %A, <4 x float> %B) { +;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + %tmp3 = fcmp ogt <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmogt2xdouble(<2 x double> %A, <2 x double> %B) { +;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + %tmp3 = fcmp ogt <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmole2xfloat(<2 x float> %A, <2 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; OLE implemented as OGE, so check reversed operands. +;CHECK: fcmge {{v[0-9]+}}.2s, v1.2s, v0.2s + %tmp3 = fcmp ole <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmole4xfloat(<4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; OLE implemented as OGE, so check reversed operands. +;CHECK: fcmge {{v[0-9]+}}.4s, v1.4s, v0.4s + %tmp3 = fcmp ole <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmole2xdouble(<2 x double> %A, <2 x double> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; OLE implemented as OGE, so check reversed operands. +;CHECK: fcmge {{v[0-9]+}}.2d, v1.2d, v0.2d + %tmp3 = fcmp ole <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmolt2xfloat(<2 x float> %A, <2 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. 
+; OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+ %tmp3 = fcmp olt <2 x float> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmolt4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+ %tmp3 = fcmp olt <4 x float> %A, %B
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmolt2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+ %tmp3 = fcmp olt <2 x double> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+define <2 x i32> @fcmone2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; ONE = OGT | OLT, OLT implemented as OGT so check reversed operands
+;CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = fcmp one <2 x float> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+define <4 x i32> @fcmone4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; ONE = OGT | OLT, OLT implemented as OGT so check reversed operands
+;CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp one <4 x float> %A, %B
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+define <2 x i64> @fcmone2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; ONE = OGT | OLT, OLT implemented as OGT so check reversed operands
+;CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp one <2 x double> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i64>
+ ret <2 x i64> %tmp4
+}
+
+
+define <2 x i32> @fcmord2xfloat(<2 x float> %A, <2 x float> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; ORD = OGE | OLT, OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s
+;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+ %tmp3 = fcmp ord <2 x float> %A, %B
+ %tmp4 = sext <2 x i1> %tmp3 to <2 x i32>
+ ret <2 x i32> %tmp4
+}
+
+
+define <4 x i32> @fcmord4xfloat(<4 x float> %A, <4 x float> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; ORD = OGE | OLT, OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s
+;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s
+;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
+ %tmp3 = fcmp ord <4 x float> %A, %B
+ %tmp4 = sext <4 x i1> %tmp3 to <4 x i32>
+ ret <4 x i32> %tmp4
+}
+
+define <2 x i64> @fcmord2xdouble(<2 x double> %A, <2 x double> %B) {
+; Using registers other than v0, v1 are possible, but would be odd.
+; ORD = OGE | OLT, OLT implemented as OGT, so check reversed operands.
+;CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d +;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ord <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + + +define <2 x i32> @fcmuno2xfloat(<2 x float> %A, <2 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UNO = !(OGE | OLT), OLT implemented as OGT, so check reversed operands. +;CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s +;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s +;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp uno <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmuno4xfloat(<4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UNO = !(OGE | OLT), OLT implemented as OGT, so check reversed operands. +;CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s +;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp uno <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmuno2xdouble(<2 x double> %A, <2 x double> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UNO = !(OGE | OLT), OLT implemented as OGT, so check reversed operands. +;CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d +;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp uno <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmueq2xfloat(<2 x float> %A, <2 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UEQ = !ONE = !(OGT | OLT), OLT implemented as OGT so check reversed operands +;CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s +;CHECK-NEXT: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s +;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ueq <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmueq4xfloat(<4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UEQ = !ONE = !(OGT | OLT), OLT implemented as OGT so check reversed operands +;CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s +;CHECK-NEXT: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ueq <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmueq2xdouble(<2 x double> %A, <2 x double> %B) { +; Using registers other than v0, v1 are possible, but would be odd. 
+; UEQ = !ONE = !(OGT | OLT), OLT implemented as OGT so check reversed operands +;CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d +;CHECK-NEXT: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ueq <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmuge2xfloat(<2 x float> %A, <2 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UGE = ULE with swapped operands, ULE implemented as !OGT. +;CHECK: fcmgt {{v[0-9]+}}.2s, v1.2s, v0.2s +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp uge <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmuge4xfloat(<4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UGE = ULE with swapped operands, ULE implemented as !OGT. +;CHECK: fcmgt {{v[0-9]+}}.4s, v1.4s, v0.4s +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp uge <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmuge2xdouble(<2 x double> %A, <2 x double> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UGE = ULE with swapped operands, ULE implemented as !OGT. +;CHECK: fcmgt {{v[0-9]+}}.2d, v1.2d, v0.2d +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp uge <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmugt2xfloat(<2 x float> %A, <2 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UGT = ULT with swapped operands, ULT implemented as !OGE. +;CHECK: fcmge {{v[0-9]+}}.2s, v1.2s, v0.2s +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ugt <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmugt4xfloat(<4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UGT = ULT with swapped operands, ULT implemented as !OGE. +;CHECK: fcmge {{v[0-9]+}}.4s, v1.4s, v0.4s +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ugt <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmugt2xdouble(<2 x double> %A, <2 x double> %B) { +;CHECK: fcmge {{v[0-9]+}}.2d, v1.2d, v0.2d +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ugt <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmule2xfloat(<2 x float> %A, <2 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; ULE implemented as !OGT. 
+;CHECK: fcmgt {{v[0-9]+}}.2s, v0.2s, v1.2s +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ule <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmule4xfloat(<4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; ULE implemented as !OGT. +;CHECK: fcmgt {{v[0-9]+}}.4s, v0.4s, v1.4s +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ule <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmule2xdouble(<2 x double> %A, <2 x double> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; ULE implemented as !OGT. +;CHECK: fcmgt {{v[0-9]+}}.2d, v0.2d, v1.2d +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ule <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmult2xfloat(<2 x float> %A, <2 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; ULT implemented as !OGE. +;CHECK: fcmge {{v[0-9]+}}.2s, v0.2s, v1.2s +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ult <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmult4xfloat(<4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; ULT implemented as !OGE. +;CHECK: fcmge {{v[0-9]+}}.4s, v0.4s, v1.4s +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ult <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmult2xdouble(<2 x double> %A, <2 x double> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; ULT implemented as !OGE. +;CHECK: fcmge {{v[0-9]+}}.2d, v0.2d, v1.2d +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ult <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmune2xfloat(<2 x float> %A, <2 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UNE = !OEQ. +;CHECK: fcmeq {{v[0-9]+}}.2s, v0.2s, v1.2s +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp une <2 x float> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmune4xfloat(<4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UNE = !OEQ. +;CHECK: fcmeq {{v[0-9]+}}.4s, v0.4s, v1.4s +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp une <4 x float> %A, %B + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmune2xdouble(<2 x double> %A, <2 x double> %B) { +; Using registers other than v0, v1 are possible, but would be odd. +; UNE = !OEQ. 
+;CHECK: fcmeq {{v[0-9]+}}.2d, v0.2d, v1.2d +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp une <2 x double> %A, %B + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmoeqz2xfloat(<2 x float> %A) { +;CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 + %tmp3 = fcmp oeq <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmoeqz4xfloat(<4 x float> %A) { +;CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 + %tmp3 = fcmp oeq <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmoeqz2xdouble(<2 x double> %A) { +;CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 + %tmp3 = fcmp oeq <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + + +define <2 x i32> @fcmogez2xfloat(<2 x float> %A) { +;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 + %tmp3 = fcmp oge <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmogez4xfloat(<4 x float> %A) { +;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 + %tmp3 = fcmp oge <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmogez2xdouble(<2 x double> %A) { +;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 + %tmp3 = fcmp oge <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmogtz2xfloat(<2 x float> %A) { +;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 + %tmp3 = fcmp ogt <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmogtz4xfloat(<4 x float> %A) { +;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 + %tmp3 = fcmp ogt <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmogtz2xdouble(<2 x double> %A) { +;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 + %tmp3 = fcmp ogt <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmoltz2xfloat(<2 x float> %A) { +;CHECK: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 + %tmp3 = fcmp olt <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmoltz4xfloat(<4 x float> %A) { +;CHECK: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 + %tmp3 = fcmp olt <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmoltz2xdouble(<2 x double> %A) { +;CHECK: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 + %tmp3 = fcmp olt <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmolez2xfloat(<2 x float> %A) { +;CHECK: fcmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 + %tmp3 = fcmp ole <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmolez4xfloat(<4 x float> %A) { +;CHECK: fcmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 + %tmp3 = fcmp ole <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmolez2xdouble(<2 x double> %A) { +;CHECK: fcmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 + %tmp3 = fcmp ole <2 x 
double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmonez2xfloat(<2 x float> %A) { +; ONE with zero = OLT | OGT +;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp one <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmonez4xfloat(<4 x float> %A) { +; ONE with zero = OLT | OGT +;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp one <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmonez2xdouble(<2 x double> %A) { +; ONE with zero = OLT | OGT +;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp one <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmordz2xfloat(<2 x float> %A) { +; ORD with zero = OLT | OGE +;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ord <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmordz4xfloat(<4 x float> %A) { +; ORD with zero = OLT | OGE +;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ord <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmordz2xdouble(<2 x double> %A) { +; ORD with zero = OLT | OGE +;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ord <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmueqz2xfloat(<2 x float> %A) { +; UEQ with zero = !ONE = !(OLT |OGT) +;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ueq <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmueqz4xfloat(<4 x float> %A) { +; UEQ with zero = !ONE = !(OLT |OGT) +;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ueq <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmueqz2xdouble(<2 x double> %A) { +; UEQ with zero = !ONE = !(OLT |OGT) +;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, 
#0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ueq <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmugez2xfloat(<2 x float> %A) { +; UGE with zero = !OLT +;CHECK: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp uge <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmugez4xfloat(<4 x float> %A) { +; UGE with zero = !OLT +;CHECK: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp uge <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmugez2xdouble(<2 x double> %A) { +; UGE with zero = !OLT +;CHECK: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp uge <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmugtz2xfloat(<2 x float> %A) { +; UGT with zero = !OLE +;CHECK: fcmle {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ugt <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmugtz4xfloat(<4 x float> %A) { +; UGT with zero = !OLE +;CHECK: fcmle {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ugt <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmugtz2xdouble(<2 x double> %A) { +; UGT with zero = !OLE +;CHECK: fcmle {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ugt <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmultz2xfloat(<2 x float> %A) { +; ULT with zero = !OGE +;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ult <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmultz4xfloat(<4 x float> %A) { +;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ult <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmultz2xdouble(<2 x double> %A) { +;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ult <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + + +define <2 x i32> @fcmulez2xfloat(<2 x float> %A) { +; ULE with zero = 
!OGT +;CHECK: fcmgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp ule <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmulez4xfloat(<4 x float> %A) { +; ULE with zero = !OGT +;CHECK: fcmgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ule <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmulez2xdouble(<2 x double> %A) { +; ULE with zero = !OGT +;CHECK: fcmgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp ule <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + +define <2 x i32> @fcmunez2xfloat(<2 x float> %A) { +; UNE with zero = !OEQ with zero +;CHECK: fcmeq {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp une <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmunez4xfloat(<4 x float> %A) { +; UNE with zero = !OEQ with zero +;CHECK: fcmeq {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp une <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} +define <2 x i64> @fcmunez2xdouble(<2 x double> %A) { +; UNE with zero = !OEQ with zero +;CHECK: fcmeq {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp une <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 +} + + +define <2 x i32> @fcmunoz2xfloat(<2 x float> %A) { +; UNO with zero = !ORD = !(OLT | OGE) +;CHECK: fcmge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b +;CHECK-NEXT: movi {{v[0-9]+}}.8b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b + %tmp3 = fcmp uno <2 x float> %A, zeroinitializer + %tmp4 = sext <2 x i1> %tmp3 to <2 x i32> + ret <2 x i32> %tmp4 +} + +define <4 x i32> @fcmunoz4xfloat(<4 x float> %A) { +; UNO with zero = !ORD = !(OLT | OGE) +;CHECK: fcmge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp uno <4 x float> %A, zeroinitializer + %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> + ret <4 x i32> %tmp4 +} + +define <2 x i64> @fcmunoz2xdouble(<2 x double> %A) { +; UNO with zero = !ORD = !(OLT | OGE) +;CHECK: fcmge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: fcmlt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, #0.0 +;CHECK-NEXT: orr {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b +;CHECK-NEXT: movi {{v[0-9]+}}.16b, #0xff +;CHECK-NEXT: eor {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b + %tmp3 = fcmp uno <2 x double> %A, zeroinitializer + %tmp4 = sext <2 x 
i1> %tmp3 to <2 x i64> + ret <2 x i64> %tmp4 + +} diff --git a/test/CodeGen/AArch64/neon-facge-facgt.ll b/test/CodeGen/AArch64/neon-facge-facgt.ll new file mode 100644 index 0000000000..146256e4be --- /dev/null +++ b/test/CodeGen/AArch64/neon-facge-facgt.ll @@ -0,0 +1,56 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <2 x i32> @llvm.arm.neon.vacged(<2 x float>, <2 x float>) +declare <4 x i32> @llvm.arm.neon.vacgeq(<4 x float>, <4 x float>) +declare <2 x i64> @llvm.aarch64.neon.vacgeq(<2 x double>, <2 x double>) + +define <2 x i32> @facge_from_intr_v2i32(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +; Using registers other than v0, v1 and v2 are possible, but would be odd. +; CHECK: facge_from_intr_v2i32: + %val = call <2 x i32> @llvm.arm.neon.vacged(<2 x float> %A, <2 x float> %B) +; CHECK: facge {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + ret <2 x i32> %val +} +define <4 x i32> @facge_from_intr_v4i32( <4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 and v2 are possible, but would be odd. +; CHECK: facge_from_intr_v4i32: + %val = call <4 x i32> @llvm.arm.neon.vacgeq(<4 x float> %A, <4 x float> %B) +; CHECK: facge {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + ret <4 x i32> %val +} + +define <2 x i64> @facge_from_intr_v2i64(<2 x double> %A, <2 x double> %B) { +; Using registers other than v0, v1 and v2 are possible, but would be odd. +; CHECK: facge_from_intr_v2i64: + %val = call <2 x i64> @llvm.aarch64.neon.vacgeq(<2 x double> %A, <2 x double> %B) +; CHECK: facge {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + ret <2 x i64> %val +} + +declare <2 x i32> @llvm.arm.neon.vacgtd(<2 x float>, <2 x float>) +declare <4 x i32> @llvm.arm.neon.vacgtq(<4 x float>, <4 x float>) +declare <2 x i64> @llvm.aarch64.neon.vacgtq(<2 x double>, <2 x double>) + +define <2 x i32> @facgt_from_intr_v2i32(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +; Using registers other than v0, v1 and v2 are possible, but would be odd. +; CHECK: facgt_from_intr_v2i32: + %val = call <2 x i32> @llvm.arm.neon.vacgtd(<2 x float> %A, <2 x float> %B) +; CHECK: facgt {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s + ret <2 x i32> %val +} +define <4 x i32> @facgt_from_intr_v4i32( <4 x float> %A, <4 x float> %B) { +; Using registers other than v0, v1 and v2 are possible, but would be odd. +; CHECK: facgt_from_intr_v4i32: + %val = call <4 x i32> @llvm.arm.neon.vacgtq(<4 x float> %A, <4 x float> %B) +; CHECK: facgt {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s + ret <4 x i32> %val +} + +define <2 x i64> @facgt_from_intr_v2i64(<2 x double> %A, <2 x double> %B) { +; Using registers other than v0, v1 and v2 are possible, but would be odd. 
+; CHECK: facgt_from_intr_v2i64: + %val = call <2 x i64> @llvm.aarch64.neon.vacgtq(<2 x double> %A, <2 x double> %B) +; CHECK: facgt {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d + ret <2 x i64> %val +} + diff --git a/test/CodeGen/AArch64/neon-fma.ll b/test/CodeGen/AArch64/neon-fma.ll new file mode 100644 index 0000000000..dcf4e28780 --- /dev/null +++ b/test/CodeGen/AArch64/neon-fma.ll @@ -0,0 +1,112 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s + +define <2 x float> @fmla2xfloat(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +;CHECK: fmla {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %tmp1 = fmul <2 x float> %A, %B; + %tmp2 = fadd <2 x float> %C, %tmp1; + ret <2 x float> %tmp2 +} + +define <4 x float> @fmla4xfloat(<4 x float> %A, <4 x float> %B, <4 x float> %C) { +;CHECK: fmla {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %tmp1 = fmul <4 x float> %A, %B; + %tmp2 = fadd <4 x float> %C, %tmp1; + ret <4 x float> %tmp2 +} + +define <2 x double> @fmla2xdouble(<2 x double> %A, <2 x double> %B, <2 x double> %C) { +;CHECK: fmla {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d + %tmp1 = fmul <2 x double> %A, %B; + %tmp2 = fadd <2 x double> %C, %tmp1; + ret <2 x double> %tmp2 +} + + +define <2 x float> @fmls2xfloat(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +;CHECK: fmls {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %tmp1 = fmul <2 x float> %A, %B; + %tmp2 = fsub <2 x float> %C, %tmp1; + ret <2 x float> %tmp2 +} + +define <4 x float> @fmls4xfloat(<4 x float> %A, <4 x float> %B, <4 x float> %C) { +;CHECK: fmls {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %tmp1 = fmul <4 x float> %A, %B; + %tmp2 = fsub <4 x float> %C, %tmp1; + ret <4 x float> %tmp2 +} + +define <2 x double> @fmls2xdouble(<2 x double> %A, <2 x double> %B, <2 x double> %C) { +;CHECK: fmls {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d + %tmp1 = fmul <2 x double> %A, %B; + %tmp2 = fsub <2 x double> %C, %tmp1; + ret <2 x double> %tmp2 +} + + +; Another set of tests for when the intrinsic is used. 
+ +declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>) +declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) +declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) + +define <2 x float> @fmla2xfloat_fused(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +;CHECK: fmla {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %val = call <2 x float> @llvm.fma.v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C) + ret <2 x float> %val +} + +define <4 x float> @fmla4xfloat_fused(<4 x float> %A, <4 x float> %B, <4 x float> %C) { +;CHECK: fmla {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %val = call <4 x float> @llvm.fma.v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C) + ret <4 x float> %val +} + +define <2 x double> @fmla2xdouble_fused(<2 x double> %A, <2 x double> %B, <2 x double> %C) { +;CHECK: fmla {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d + %val = call <2 x double> @llvm.fma.v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C) + ret <2 x double> %val +} + +define <2 x float> @fmls2xfloat_fused(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +;CHECK: fmls {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %negA = fsub <2 x float> <float -0.0, float -0.0>, %A + %val = call <2 x float> @llvm.fma.v2f32(<2 x float> %negA, <2 x float> %B, <2 x float> %C) + ret <2 x float> %val +} + +define <4 x float> @fmls4xfloat_fused(<4 x float> %A, <4 x float> %B, <4 x float> %C) { +;CHECK: fmls {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %negA = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %A + %val = call <4 x float> @llvm.fma.v4f32(<4 x float> %negA, <4 x float> %B, <4 x float> %C) + ret <4 x float> %val +} + +define <2 x double> @fmls2xdouble_fused(<2 x double> %A, <2 x double> %B, <2 x double> %C) { +;CHECK: fmls {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d + %negA = fsub <2 x double> <double -0.0, double -0.0>, %A + %val = call <2 x double> @llvm.fma.v2f64(<2 x double> %negA, <2 x double> %B, <2 x double> %C) + ret <2 x double> %val +} + +declare <2 x float> @llvm.fmuladd.v2f32(<2 x float>, <2 x float>, <2 x float>) +declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) +declare <2 x double> @llvm.fmuladd.v2f64(<2 x double>, <2 x double>, <2 x double>) + +define <2 x float> @fmuladd2xfloat(<2 x float> %A, <2 x float> %B, <2 x float> %C) { +;CHECK: fmla {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %val = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> %A, <2 x float> %B, <2 x float> %C) + ret <2 x float> %val +} + +define <4 x float> @fmuladd4xfloat_fused(<4 x float> %A, <4 x float> %B, <4 x float> %C) { +;CHECK: fmla {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %val = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %A, <4 x float> %B, <4 x float> %C) + ret <4 x float> %val +} + +define <2 x double> @fmuladd2xdouble_fused(<2 x double> %A, <2 x double> %B, <2 x double> %C) { +;CHECK: fmla {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d + %val = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> %A, <2 x double> %B, <2 x double> %C) + ret <2 x double> %val +} diff --git a/test/CodeGen/AArch64/neon-frsqrt-frecp.ll b/test/CodeGen/AArch64/neon-frsqrt-frecp.ll new file mode 100644 index 0000000000..46fe25d74d --- /dev/null +++ b/test/CodeGen/AArch64/neon-frsqrt-frecp.ll @@ -0,0 +1,54 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +; Set of tests for when the intrinsic is used. 
+ +declare <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.arm.neon.vrsqrts.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @frsqrts_from_intr_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; Using registers other than v0, v1 are possible, but would be odd. +; CHECK: frsqrts v0.2s, v0.2s, v1.2s + %val = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> %lhs, <2 x float> %rhs) + ret <2 x float> %val +} + +define <4 x float> @frsqrts_from_intr_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; Using registers other than v0, v1 are possible, but would be odd. +; CHECK: frsqrts v0.4s, v0.4s, v1.4s + %val = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %lhs, <4 x float> %rhs) + ret <4 x float> %val +} + +define <2 x double> @frsqrts_from_intr_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; Using registers other than v0, v1 are possible, but would be odd. +; CHECK: frsqrts v0.2d, v0.2d, v1.2d + %val = call <2 x double> @llvm.arm.neon.vrsqrts.v2f64(<2 x double> %lhs, <2 x double> %rhs) + ret <2 x double> %val +} + +declare <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.arm.neon.vrecps.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @frecps_from_intr_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; Using registers other than v0, v1 are possible, but would be odd. +; CHECK: frecps v0.2s, v0.2s, v1.2s + %val = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %lhs, <2 x float> %rhs) + ret <2 x float> %val +} + +define <4 x float> @frecps_from_intr_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; Using registers other than v0, v1 are possible, but would be odd. +; CHECK: frecps v0.4s, v0.4s, v1.4s + %val = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %lhs, <4 x float> %rhs) + ret <4 x float> %val +} + +define <2 x double> @frecps_from_intr_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; Using registers other than v0, v1 are possible, but would be odd. 
+; CHECK: frecps v0.2d, v0.2d, v1.2d + %val = call <2 x double> @llvm.arm.neon.vrecps.v2f64(<2 x double> %lhs, <2 x double> %rhs) + ret <2 x double> %val +} + diff --git a/test/CodeGen/AArch64/neon-halving-add-sub.ll b/test/CodeGen/AArch64/neon-halving-add-sub.ll new file mode 100644 index 0000000000..a8f59dbdb0 --- /dev/null +++ b/test/CodeGen/AArch64/neon-halving-add-sub.ll @@ -0,0 +1,207 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_uhadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_uhadd_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: uhadd v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_shadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_shadd_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vhadds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: shadd v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_uhadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_uhadd_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: uhadd v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_shadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_shadd_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vhadds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: shadd v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_uhadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_uhadd_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vhaddu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: uhadd v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_shadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_shadd_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vhadds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: shadd v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_uhadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_uhadd_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: uhadd v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_shadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_shadd_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vhadds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: shadd v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +declare <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_uhadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_uhadd_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vhaddu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: uhadd v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_shadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_shadd_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vhadds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; 
CHECK: shadd v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_uhadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_uhadd_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: uhadd v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_shadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_shadd_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: shadd v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + + +declare <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_uhsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_uhsub_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vhsubu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: uhsub v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_shsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_shsub_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vhsubs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: shsub v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_uhsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_uhsub_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vhsubu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: uhsub v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_shsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_shsub_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vhsubs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: shsub v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_uhsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_uhsub_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vhsubu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: uhsub v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_shsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_shsub_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vhsubs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: shsub v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_uhsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_uhsub_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vhsubu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: uhsub v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_shsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_shsub_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vhsubs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: shsub v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +declare <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_uhsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_uhsub_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vhsubu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: uhsub v0.2s, v0.2s, 
v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_shsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_shsub_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vhsubs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: shsub v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_uhsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_uhsub_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vhsubu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: uhsub v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_shsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_shsub_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vhsubs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: shsub v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + diff --git a/test/CodeGen/AArch64/neon-max-min-pairwise.ll b/test/CodeGen/AArch64/neon-max-min-pairwise.ll new file mode 100644 index 0000000000..d757aca86a --- /dev/null +++ b/test/CodeGen/AArch64/neon-max-min-pairwise.ll @@ -0,0 +1,310 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_smaxp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; Using registers other than v0, v1 are possible, but would be odd. +; CHECK: test_smaxp_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: smaxp v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_umaxp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { + %tmp1 = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: umaxp v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vpmaxs.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vpmaxu.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_smaxp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_smaxp_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vpmaxs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: smaxp v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_umaxp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_umaxp_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vpmaxu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: umaxp v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_smaxp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_smaxp_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: smaxp v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_umaxp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_umaxp_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: umaxp v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + + +declare <8 x i16> @llvm.arm.neon.vpmaxs.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vpmaxu.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_smaxp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_smaxp_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vpmaxs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: smaxp v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> 
@test_umaxp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_umaxp_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vpmaxu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: umaxp v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + + +declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_smaxp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_smaxp_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: smaxp v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_umaxp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_umaxp_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: umaxp v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vpmaxs.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vpmaxu.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_smaxp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_smaxp_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vpmaxs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: smaxp v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_umaxp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_umaxp_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vpmaxu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: umaxp v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +declare <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_sminp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; Using registers other than v0, v1 are possible, but would be odd. +; CHECK: test_sminp_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vpmins.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: sminp v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_uminp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { + %tmp1 = call <8 x i8> @llvm.arm.neon.vpminu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: uminp v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vpmins.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vpminu.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_sminp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_sminp_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vpmins.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: sminp v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_uminp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_uminp_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vpminu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: uminp v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_sminp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_sminp_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vpmins.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: sminp v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_uminp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_uminp_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: uminp v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + + +declare <8 x i16> @llvm.arm.neon.vpmins.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vpminu.v8i16(<8 x i16>, <8 x i16>) + 
+define <8 x i16> @test_sminp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_sminp_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vpmins.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: sminp v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_uminp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_uminp_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vpminu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: uminp v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + + +declare <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_sminp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_sminp_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vpmins.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: sminp v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_uminp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_uminp_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: uminp v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vpmins.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vpminu.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_sminp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_sminp_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vpmins.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: sminp v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_uminp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_uminp_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vpminu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: uminp v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.arm.neon.vpmaxs.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.arm.neon.vpmaxs.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @test_fmaxp_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_fmaxp_v2f32: + %val = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: fmaxp v0.2s, v0.2s, v1.2s + ret <2 x float> %val +} + +define <4 x float> @test_fmaxp_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_fmaxp_v4f32: + %val = call <4 x float> @llvm.arm.neon.vpmaxs.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: fmaxp v0.4s, v0.4s, v1.4s + ret <4 x float> %val +} + +define <2 x double> @test_fmaxp_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_fmaxp_v2f64: + %val = call <2 x double> @llvm.arm.neon.vpmaxs.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: fmaxp v0.2d, v0.2d, v1.2d + ret <2 x double> %val +} + +declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.arm.neon.vpmins.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.arm.neon.vpmins.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @test_fminp_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_fminp_v2f32: + %val = call <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: fminp v0.2s, v0.2s, v1.2s + ret <2 x float> %val +} + +define <4 x float> @test_fminp_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_fminp_v4f32: + %val = call <4 x float> @llvm.arm.neon.vpmins.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: fminp v0.4s, v0.4s, v1.4s + ret <4 x float> %val +} + +define <2 x double> 
@test_fminp_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_fminp_v2f64: + %val = call <2 x double> @llvm.arm.neon.vpmins.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: fminp v0.2d, v0.2d, v1.2d + ret <2 x double> %val +} + +declare <2 x float> @llvm.aarch64.neon.vpmaxnm.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.aarch64.neon.vpmaxnm.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.aarch64.neon.vpmaxnm.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @test_fmaxnmp_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_fmaxnmp_v2f32: + %val = call <2 x float> @llvm.aarch64.neon.vpmaxnm.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: fmaxnmp v0.2s, v0.2s, v1.2s + ret <2 x float> %val +} + +define <4 x float> @test_fmaxnmp_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_fmaxnmp_v4f32: + %val = call <4 x float> @llvm.aarch64.neon.vpmaxnm.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: fmaxnmp v0.4s, v0.4s, v1.4s + ret <4 x float> %val +} + +define <2 x double> @test_fmaxnmp_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_fmaxnmp_v2f64: + %val = call <2 x double> @llvm.aarch64.neon.vpmaxnm.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: fmaxnmp v0.2d, v0.2d, v1.2d + ret <2 x double> %val +} + +declare <2 x float> @llvm.aarch64.neon.vpminnm.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.aarch64.neon.vpminnm.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.aarch64.neon.vpminnm.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @test_fminnmp_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_fminnmp_v2f32: + %val = call <2 x float> @llvm.aarch64.neon.vpminnm.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: fminnmp v0.2s, v0.2s, v1.2s + ret <2 x float> %val +} + +define <4 x float> @test_fminnmp_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_fminnmp_v4f32: + %val = call <4 x float> @llvm.aarch64.neon.vpminnm.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: fminnmp v0.4s, v0.4s, v1.4s + ret <4 x float> %val +} + +define <2 x double> @test_fminnmp_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_fminnmp_v2f64: + %val = call <2 x double> @llvm.aarch64.neon.vpminnm.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: fminnmp v0.2d, v0.2d, v1.2d + ret <2 x double> %val +} + diff --git a/test/CodeGen/AArch64/neon-max-min.ll b/test/CodeGen/AArch64/neon-max-min.ll new file mode 100644 index 0000000000..7889c77e37 --- /dev/null +++ b/test/CodeGen/AArch64/neon-max-min.ll @@ -0,0 +1,310 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_smax_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; Using registers other than v0, v1 are possible, but would be odd. 
+; CHECK: test_smax_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: smax v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_umax_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { + %tmp1 = call <8 x i8> @llvm.arm.neon.vmaxu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: umax v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_smax_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_smax_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vmaxs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: smax v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_umax_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_umax_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vmaxu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: umax v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_smax_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_smax_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vmaxs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: smax v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_umax_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_umax_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vmaxu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: umax v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + + +declare <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_smax_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_smax_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vmaxs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: smax v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_umax_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_umax_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: umax v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + + +declare <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_smax_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_smax_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vmaxs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: smax v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_umax_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_umax_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vmaxu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: umax v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_smax_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_smax_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vmaxs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: smax v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_umax_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_umax_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vmaxu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: umax v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +declare <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8>, <8 x 
i8>) +declare <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_smin_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; Using registers other than v0, v1 are possible, but would be odd. +; CHECK: test_smin_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: smin v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_umin_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { + %tmp1 = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: umin v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_smin_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_smin_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: smin v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_umin_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_umin_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: umin v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_smin_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_smin_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: smin v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_umin_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_umin_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: umin v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + + +declare <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_smin_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_smin_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: smin v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_umin_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_umin_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: umin v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + + +declare <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_smin_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_smin_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: smin v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_umin_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_umin_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: umin v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_smin_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_smin_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: smin v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_umin_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_umin_v4i32: + 
%tmp1 = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: umin v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +declare <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.arm.neon.vmaxs.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @test_fmax_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_fmax_v2f32: + %val = call <2 x float> @llvm.arm.neon.vmaxs.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: fmax v0.2s, v0.2s, v1.2s + ret <2 x float> %val +} + +define <4 x float> @test_fmax_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_fmax_v4f32: + %val = call <4 x float> @llvm.arm.neon.vmaxs.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: fmax v0.4s, v0.4s, v1.4s + ret <4 x float> %val +} + +define <2 x double> @test_fmax_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_fmax_v2f64: + %val = call <2 x double> @llvm.arm.neon.vmaxs.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: fmax v0.2d, v0.2d, v1.2d + ret <2 x double> %val +} + +declare <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.arm.neon.vmins.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @test_fmin_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_fmin_v2f32: + %val = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: fmin v0.2s, v0.2s, v1.2s + ret <2 x float> %val +} + +define <4 x float> @test_fmin_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_fmin_v4f32: + %val = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: fmin v0.4s, v0.4s, v1.4s + ret <4 x float> %val +} + +define <2 x double> @test_fmin_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_fmin_v2f64: + %val = call <2 x double> @llvm.arm.neon.vmins.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: fmin v0.2d, v0.2d, v1.2d + ret <2 x double> %val +} + + +declare <2 x float> @llvm.aarch64.neon.vmaxnm.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.aarch64.neon.vmaxnm.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.aarch64.neon.vmaxnm.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @test_fmaxnm_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_fmaxnm_v2f32: + %val = call <2 x float> @llvm.aarch64.neon.vmaxnm.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: fmaxnm v0.2s, v0.2s, v1.2s + ret <2 x float> %val +} + +define <4 x float> @test_fmaxnm_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_fmaxnm_v4f32: + %val = call <4 x float> @llvm.aarch64.neon.vmaxnm.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: fmaxnm v0.4s, v0.4s, v1.4s + ret <4 x float> %val +} + +define <2 x double> @test_fmaxnm_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_fmaxnm_v2f64: + %val = call <2 x double> @llvm.aarch64.neon.vmaxnm.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: fmaxnm v0.2d, v0.2d, v1.2d + ret <2 x double> %val +} + +declare <2 x float> @llvm.aarch64.neon.vminnm.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.aarch64.neon.vminnm.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.aarch64.neon.vminnm.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @test_fminnm_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; CHECK: test_fminnm_v2f32: + %val = call <2 x 
float> @llvm.aarch64.neon.vminnm.v2f32(<2 x float> %lhs, <2 x float> %rhs) +; CHECK: fminnm v0.2s, v0.2s, v1.2s + ret <2 x float> %val +} + +define <4 x float> @test_fminnm_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; CHECK: test_fminnm_v4f32: + %val = call <4 x float> @llvm.aarch64.neon.vminnm.v4f32(<4 x float> %lhs, <4 x float> %rhs) +; CHECK: fminnm v0.4s, v0.4s, v1.4s + ret <4 x float> %val +} + +define <2 x double> @test_fminnm_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; CHECK: test_fminnm_v2f64: + %val = call <2 x double> @llvm.aarch64.neon.vminnm.v2f64(<2 x double> %lhs, <2 x double> %rhs) +; CHECK: fminnm v0.2d, v0.2d, v1.2d + ret <2 x double> %val +} diff --git a/test/CodeGen/AArch64/neon-mla-mls.ll b/test/CodeGen/AArch64/neon-mla-mls.ll new file mode 100644 index 0000000000..23e9223a8b --- /dev/null +++ b/test/CodeGen/AArch64/neon-mla-mls.ll @@ -0,0 +1,88 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + + +define <8 x i8> @mla8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { +;CHECK: mla {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = mul <8 x i8> %A, %B; + %tmp2 = add <8 x i8> %C, %tmp1; + ret <8 x i8> %tmp2 +} + +define <16 x i8> @mla16xi8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) { +;CHECK: mla {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = mul <16 x i8> %A, %B; + %tmp2 = add <16 x i8> %C, %tmp1; + ret <16 x i8> %tmp2 +} + +define <4 x i16> @mla4xi16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) { +;CHECK: mla {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h + %tmp1 = mul <4 x i16> %A, %B; + %tmp2 = add <4 x i16> %C, %tmp1; + ret <4 x i16> %tmp2 +} + +define <8 x i16> @mla8xi16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) { +;CHECK: mla {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h + %tmp1 = mul <8 x i16> %A, %B; + %tmp2 = add <8 x i16> %C, %tmp1; + ret <8 x i16> %tmp2 +} + +define <2 x i32> @mla2xi32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) { +;CHECK: mla {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %tmp1 = mul <2 x i32> %A, %B; + %tmp2 = add <2 x i32> %C, %tmp1; + ret <2 x i32> %tmp2 +} + +define <4 x i32> @mla4xi32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) { +;CHECK: mla {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %tmp1 = mul <4 x i32> %A, %B; + %tmp2 = add <4 x i32> %C, %tmp1; + ret <4 x i32> %tmp2 +} + +define <8 x i8> @mls8xi8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C) { +;CHECK: mls {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp1 = mul <8 x i8> %A, %B; + %tmp2 = sub <8 x i8> %C, %tmp1; + ret <8 x i8> %tmp2 +} + +define <16 x i8> @mls16xi8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C) { +;CHECK: mls {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp1 = mul <16 x i8> %A, %B; + %tmp2 = sub <16 x i8> %C, %tmp1; + ret <16 x i8> %tmp2 +} + +define <4 x i16> @mls4xi16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C) { +;CHECK: mls {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h + %tmp1 = mul <4 x i16> %A, %B; + %tmp2 = sub <4 x i16> %C, %tmp1; + ret <4 x i16> %tmp2 +} + +define <8 x i16> @mls8xi16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C) { +;CHECK: mls {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h + %tmp1 = mul <8 x i16> %A, %B; + %tmp2 = sub <8 x i16> %C, %tmp1; + ret <8 x i16> %tmp2 +} + +define <2 x i32> @mls2xi32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C) { +;CHECK: mls {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %tmp1 = mul <2 x i32> %A, %B; + %tmp2 = sub <2 x i32> %C, %tmp1; + ret <2 x i32> %tmp2 +} + +define <4 x i32> @mls4xi32(<4 x i32> %A, <4 x i32> 
%B, <4 x i32> %C) { +;CHECK: mls {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %tmp1 = mul <4 x i32> %A, %B; + %tmp2 = sub <4 x i32> %C, %tmp1; + ret <4 x i32> %tmp2 +} + + diff --git a/test/CodeGen/AArch64/neon-mov.ll b/test/CodeGen/AArch64/neon-mov.ll new file mode 100644 index 0000000000..42f6a894da --- /dev/null +++ b/test/CodeGen/AArch64/neon-mov.ll @@ -0,0 +1,205 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +define <8 x i8> @movi8b() { +;CHECK: movi {{v[0-31]+}}.8b, #0x8 + ret <8 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 > +} + +define <16 x i8> @movi16b() { +;CHECK: movi {{v[0-31]+}}.16b, #0x8 + ret <16 x i8> < i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8 > +} + +define <2 x i32> @movi2s_lsl0() { +;CHECK: movi {{v[0-31]+}}.2s, #0xff + ret <2 x i32> < i32 255, i32 255 > +} + +define <2 x i32> @movi2s_lsl8() { +;CHECK: movi {{v[0-31]+}}.2s, #0xff, lsl #8 + ret <2 x i32> < i32 65280, i32 65280 > +} + +define <2 x i32> @movi2s_lsl16() { +;CHECK: movi {{v[0-31]+}}.2s, #0xff, lsl #16 + ret <2 x i32> < i32 16711680, i32 16711680 > + +} + +define <2 x i32> @movi2s_lsl24() { +;CHECK: movi {{v[0-31]+}}.2s, #0xff, lsl #24 + ret <2 x i32> < i32 4278190080, i32 4278190080 > +} + +define <4 x i32> @movi4s_lsl0() { +;CHECK: movi {{v[0-31]+}}.4s, #0xff + ret <4 x i32> < i32 255, i32 255, i32 255, i32 255 > +} + +define <4 x i32> @movi4s_lsl8() { +;CHECK: movi {{v[0-31]+}}.4s, #0xff, lsl #8 + ret <4 x i32> < i32 65280, i32 65280, i32 65280, i32 65280 > +} + +define <4 x i32> @movi4s_lsl16() { +;CHECK: movi {{v[0-31]+}}.4s, #0xff, lsl #16 + ret <4 x i32> < i32 16711680, i32 16711680, i32 16711680, i32 16711680 > + +} + +define <4 x i32> @movi4s_lsl24() { +;CHECK: movi {{v[0-31]+}}.4s, #0xff, lsl #24 + ret <4 x i32> < i32 4278190080, i32 4278190080, i32 4278190080, i32 4278190080 > +} + +define <4 x i16> @movi4h_lsl0() { +;CHECK: movi {{v[0-31]+}}.4h, #0xff + ret <4 x i16> < i16 255, i16 255, i16 255, i16 255 > +} + +define <4 x i16> @movi4h_lsl8() { +;CHECK: movi {{v[0-31]+}}.4h, #0xff, lsl #8 + ret <4 x i16> < i16 65280, i16 65280, i16 65280, i16 65280 > +} + +define <8 x i16> @movi8h_lsl0() { +;CHECK: movi {{v[0-31]+}}.8h, #0xff + ret <8 x i16> < i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255 > +} + +define <8 x i16> @movi8h_lsl8() { +;CHECK: movi {{v[0-31]+}}.8h, #0xff, lsl #8 + ret <8 x i16> < i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280, i16 65280 > +} + + +define <2 x i32> @mvni2s_lsl0() { +;CHECK: mvni {{v[0-31]+}}.2s, #0x10 + ret <2 x i32> < i32 4294967279, i32 4294967279 > +} + +define <2 x i32> @mvni2s_lsl8() { +;CHECK: mvni {{v[0-31]+}}.2s, #0x10, lsl #8 + ret <2 x i32> < i32 4294963199, i32 4294963199 > +} + +define <2 x i32> @mvni2s_lsl16() { +;CHECK: mvni {{v[0-31]+}}.2s, #0x10, lsl #16 + ret <2 x i32> < i32 4293918719, i32 4293918719 > +} + +define <2 x i32> @mvni2s_lsl24() { +;CHECK: mvni {{v[0-31]+}}.2s, #0x10, lsl #24 + ret <2 x i32> < i32 4026531839, i32 4026531839 > +} + +define <4 x i32> @mvni4s_lsl0() { +;CHECK: mvni {{v[0-31]+}}.4s, #0x10 + ret <4 x i32> < i32 4294967279, i32 4294967279, i32 4294967279, i32 4294967279 > +} + +define <4 x i32> @mvni4s_lsl8() { +;CHECK: mvni {{v[0-31]+}}.4s, #0x10, lsl #8 + ret <4 x i32> < i32 4294963199, i32 4294963199, i32 4294963199, i32 4294963199 > +} + +define <4 x i32> @mvni4s_lsl16() { +;CHECK: mvni {{v[0-31]+}}.4s, #0x10, lsl #16 + ret <4 x i32> < i32 
4293918719, i32 4293918719, i32 4293918719, i32 4293918719 > + +} + +define <4 x i32> @mvni4s_lsl24() { +;CHECK: mvni {{v[0-31]+}}.4s, #0x10, lsl #24 + ret <4 x i32> < i32 4026531839, i32 4026531839, i32 4026531839, i32 4026531839 > +} + + +define <4 x i16> @mvni4h_lsl0() { +;CHECK: mvni {{v[0-31]+}}.4h, #0x10 + ret <4 x i16> < i16 65519, i16 65519, i16 65519, i16 65519 > +} + +define <4 x i16> @mvni4h_lsl8() { +;CHECK: mvni {{v[0-31]+}}.4h, #0x10, lsl #8 + ret <4 x i16> < i16 61439, i16 61439, i16 61439, i16 61439 > +} + +define <8 x i16> @mvni8h_lsl0() { +;CHECK: mvni {{v[0-31]+}}.8h, #0x10 + ret <8 x i16> < i16 65519, i16 65519, i16 65519, i16 65519, i16 65519, i16 65519, i16 65519, i16 65519 > +} + +define <8 x i16> @mvni8h_lsl8() { +;CHECK: mvni {{v[0-31]+}}.8h, #0x10, lsl #8 + ret <8 x i16> < i16 61439, i16 61439, i16 61439, i16 61439, i16 61439, i16 61439, i16 61439, i16 61439 > +} + + +define <2 x i32> @movi2s_msl8(<2 x i32> %a) { +;CHECK: movi {{v[0-31]+}}.2s, #0xff, msl #8 + ret <2 x i32> < i32 65535, i32 65535 > +} + +define <2 x i32> @movi2s_msl16() { +;CHECK: movi {{v[0-31]+}}.2s, #0xff, msl #16 + ret <2 x i32> < i32 16777215, i32 16777215 > +} + + +define <4 x i32> @movi4s_msl8() { +;CHECK: movi {{v[0-31]+}}.4s, #0xff, msl #8 + ret <4 x i32> < i32 65535, i32 65535, i32 65535, i32 65535 > +} + +define <4 x i32> @movi4s_msl16() { +;CHECK: movi {{v[0-31]+}}.4s, #0xff, msl #16 + ret <4 x i32> < i32 16777215, i32 16777215, i32 16777215, i32 16777215 > +} + +define <2 x i32> @mvni2s_msl8() { +;CHECK: mvni {{v[0-31]+}}.2s, #0x10, msl #8 + ret <2 x i32> < i32 18446744073709547264, i32 18446744073709547264> +} + +define <2 x i32> @mvni2s_msl16() { +;CHECK: mvni {{v[0-31]+}}.2s, #0x10, msl #16 + ret <2 x i32> < i32 18446744073708437504, i32 18446744073708437504> +} + +define <4 x i32> @mvni4s_msl8() { +;CHECK: mvni {{v[0-31]+}}.4s, #0x10, msl #8 + ret <4 x i32> < i32 18446744073709547264, i32 18446744073709547264, i32 18446744073709547264, i32 18446744073709547264> +} + +define <4 x i32> @mvni4s_msl16() { +;CHECK: mvni {{v[0-31]+}}.4s, #0x10, msl #16 + ret <4 x i32> < i32 18446744073708437504, i32 18446744073708437504, i32 18446744073708437504, i32 18446744073708437504> +} + +define <2 x i64> @movi2d() { +;CHECK: movi {{v[0-31]+}}.2d, #0xff0000ff0000ffff + ret <2 x i64> < i64 18374687574888349695, i64 18374687574888349695 > +} + +define <1 x i64> @movid() { +;CHECK: movi {{d[0-31]+}}, #0xff0000ff0000ffff + ret <1 x i64> < i64 18374687574888349695 > +} + +define <2 x float> @fmov2s() { +;CHECK: fmov {{v[0-31]+}}.2s, #-12.00000000 + ret <2 x float> < float -1.2e1, float -1.2e1> +} + +define <4 x float> @fmov4s() { +;CHECK: fmov {{v[0-31]+}}.4s, #-12.00000000 + ret <4 x float> < float -1.2e1, float -1.2e1, float -1.2e1, float -1.2e1> +} + +define <2 x double> @fmov2d() { +;CHECK: fmov {{v[0-31]+}}.2d, #-12.00000000 + ret <2 x double> < double -1.2e1, double -1.2e1> +} + + diff --git a/test/CodeGen/AArch64/neon-mul-div.ll b/test/CodeGen/AArch64/neon-mul-div.ll new file mode 100644 index 0000000000..e1be313266 --- /dev/null +++ b/test/CodeGen/AArch64/neon-mul-div.ll @@ -0,0 +1,181 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + + +define <8 x i8> @mul8xi8(<8 x i8> %A, <8 x i8> %B) { +;CHECK: mul {{v[0-31]+}}.8b, {{v[0-31]+}}.8b, {{v[0-31]+}}.8b + %tmp3 = mul <8 x i8> %A, %B; + ret <8 x i8> %tmp3 +} + +define <16 x i8> @mul16xi8(<16 x i8> %A, <16 x i8> %B) { +;CHECK: mul {{v[0-31]+}}.16b, {{v[0-31]+}}.16b, {{v[0-31]+}}.16b + %tmp3 = mul 
<16 x i8> %A, %B; + ret <16 x i8> %tmp3 +} + +define <4 x i16> @mul4xi16(<4 x i16> %A, <4 x i16> %B) { +;CHECK: mul {{v[0-31]+}}.4h, {{v[0-31]+}}.4h, {{v[0-31]+}}.4h + %tmp3 = mul <4 x i16> %A, %B; + ret <4 x i16> %tmp3 +} + +define <8 x i16> @mul8xi16(<8 x i16> %A, <8 x i16> %B) { +;CHECK: mul {{v[0-31]+}}.8h, {{v[0-31]+}}.8h, {{v[0-31]+}}.8h + %tmp3 = mul <8 x i16> %A, %B; + ret <8 x i16> %tmp3 +} + +define <2 x i32> @mul2xi32(<2 x i32> %A, <2 x i32> %B) { +;CHECK: mul {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %tmp3 = mul <2 x i32> %A, %B; + ret <2 x i32> %tmp3 +} + +define <4 x i32> @mul4x32(<4 x i32> %A, <4 x i32> %B) { +;CHECK: mul {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %tmp3 = mul <4 x i32> %A, %B; + ret <4 x i32> %tmp3 +} + + define <2 x float> @mul2xfloat(<2 x float> %A, <2 x float> %B) { +;CHECK: fmul {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %tmp3 = fmul <2 x float> %A, %B; + ret <2 x float> %tmp3 +} + +define <4 x float> @mul4xfloat(<4 x float> %A, <4 x float> %B) { +;CHECK: fmul {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %tmp3 = fmul <4 x float> %A, %B; + ret <4 x float> %tmp3 +} +define <2 x double> @mul2xdouble(<2 x double> %A, <2 x double> %B) { +;CHECK: fmul {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d + %tmp3 = fmul <2 x double> %A, %B; + ret <2 x double> %tmp3 +} + + + define <2 x float> @div2xfloat(<2 x float> %A, <2 x float> %B) { +;CHECK: fdiv {{v[0-31]+}}.2s, {{v[0-31]+}}.2s, {{v[0-31]+}}.2s + %tmp3 = fdiv <2 x float> %A, %B; + ret <2 x float> %tmp3 +} + +define <4 x float> @div4xfloat(<4 x float> %A, <4 x float> %B) { +;CHECK: fdiv {{v[0-31]+}}.4s, {{v[0-31]+}}.4s, {{v[0-31]+}}.4s + %tmp3 = fdiv <4 x float> %A, %B; + ret <4 x float> %tmp3 +} +define <2 x double> @div2xdouble(<2 x double> %A, <2 x double> %B) { +;CHECK: fdiv {{v[0-31]+}}.2d, {{v[0-31]+}}.2d, {{v[0-31]+}}.2d + %tmp3 = fdiv <2 x double> %A, %B; + ret <2 x double> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8>, <8 x i8>) +declare <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8>, <16 x i8>) + +define <8 x i8> @poly_mulv8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: poly_mulv8i8: + %prod = call <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: pmul v0.8b, v0.8b, v1.8b + ret <8 x i8> %prod +} + +define <16 x i8> @poly_mulv16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: poly_mulv16i8: + %prod = call <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: pmul v0.16b, v0.16b, v1.16b + ret <16 x i8> %prod +} + +declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) +declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i16> @test_sqdmulh_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_sqdmulh_v4i16: + %prod = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: sqdmulh v0.4h, v0.4h, v1.4h + ret <4 x i16> %prod +} + +define <8 x i16> @test_sqdmulh_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_sqdmulh_v8i16: + %prod = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: sqdmulh v0.8h, v0.8h, v1.8h + ret <8 x i16> %prod +} + +define <2 x i32> @test_sqdmulh_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_sqdmulh_v2i32: + %prod = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: sqdmulh v0.2s, v0.2s, 
v1.2s + ret <2 x i32> %prod +} + +define <4 x i32> @test_sqdmulh_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_sqdmulh_v4i32: + %prod = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: sqdmulh v0.4s, v0.4s, v1.4s + ret <4 x i32> %prod +} + +declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) +declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i16> @test_sqrdmulh_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_sqrdmulh_v4i16: + %prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: sqrdmulh v0.4h, v0.4h, v1.4h + ret <4 x i16> %prod +} + +define <8 x i16> @test_sqrdmulh_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_sqrdmulh_v8i16: + %prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: sqrdmulh v0.8h, v0.8h, v1.8h + ret <8 x i16> %prod +} + +define <2 x i32> @test_sqrdmulh_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_sqrdmulh_v2i32: + %prod = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: sqrdmulh v0.2s, v0.2s, v1.2s + ret <2 x i32> %prod +} + +define <4 x i32> @test_sqrdmulh_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_sqrdmulh_v4i32: + %prod = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: sqrdmulh v0.4s, v0.4s, v1.4s + ret <4 x i32> %prod +} + +declare <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float>, <2 x float>) +declare <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double>, <2 x double>) + +define <2 x float> @fmulx_v2f32(<2 x float> %lhs, <2 x float> %rhs) { +; Using registers other than v0, v1 and v2 is possible, but would be odd. +; CHECK: fmulx v0.2s, v0.2s, v1.2s + %val = call <2 x float> @llvm.aarch64.neon.vmulx.v2f32(<2 x float> %lhs, <2 x float> %rhs) + ret <2 x float> %val +} + +define <4 x float> @fmulx_v4f32(<4 x float> %lhs, <4 x float> %rhs) { +; Using registers other than v0, v1 and v2 is possible, but would be odd. +; CHECK: fmulx v0.4s, v0.4s, v1.4s + %val = call <4 x float> @llvm.aarch64.neon.vmulx.v4f32(<4 x float> %lhs, <4 x float> %rhs) + ret <4 x float> %val +} + +define <2 x double> @fmulx_v2f64(<2 x double> %lhs, <2 x double> %rhs) { +; Using registers other than v0, v1 and v2 is possible, but would be odd.
+; CHECK: fmulx v0.2d, v0.2d, v1.2d + %val = call <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double> %lhs, <2 x double> %rhs) + ret <2 x double> %val +} diff --git a/test/CodeGen/AArch64/neon-rounding-halving-add.ll b/test/CodeGen/AArch64/neon-rounding-halving-add.ll new file mode 100644 index 0000000000..009da3b51a --- /dev/null +++ b/test/CodeGen/AArch64/neon-rounding-halving-add.ll @@ -0,0 +1,105 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_urhadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_urhadd_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: urhadd v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_srhadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_srhadd_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: srhadd v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_urhadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_urhadd_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: urhadd v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_srhadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_srhadd_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: srhadd v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_urhadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_urhadd_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: urhadd v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_srhadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_srhadd_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: srhadd v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_urhadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_urhadd_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: urhadd v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_srhadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_srhadd_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: srhadd v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +declare <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_urhadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_urhadd_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: urhadd v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_srhadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_srhadd_v2i32: + %tmp1 = call <2 x i32> 
@llvm.arm.neon.vrhadds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: srhadd v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_urhadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_urhadd_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: urhadd v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_srhadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_srhadd_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: srhadd v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + + diff --git a/test/CodeGen/AArch64/neon-rounding-shift.ll b/test/CodeGen/AArch64/neon-rounding-shift.ll new file mode 100644 index 0000000000..404e49185e --- /dev/null +++ b/test/CodeGen/AArch64/neon-rounding-shift.ll @@ -0,0 +1,138 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_urshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_urshl_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: urshl v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_srshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_srshl_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: srshl v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_urshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_urshl_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: urshl v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_srshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_srshl_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: srshl v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_urshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_urshl_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: urshl v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_srshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_srshl_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: srshl v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_urshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_urshl_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: urshl v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_srshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_srshl_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %lhs, <8 x i16> 
%rhs) +; CHECK: srshl v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +declare <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_urshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_urshl_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: urshl v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_srshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_srshl_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: srshl v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_urshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_urshl_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: urshl v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_srshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_srshl_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: srshl v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>) + +define <1 x i64> @test_urshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_urshl_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +; CHECK: urshl d0, d0, d1 + ret <1 x i64> %tmp1 +} + +define <1 x i64> @test_srshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_srshl_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +; CHECK: srshl d0, d0, d1 + ret <1 x i64> %tmp1 +} + +declare <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64>, <2 x i64>) + +define <2 x i64> @test_urshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_urshl_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: urshl v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + +define <2 x i64> @test_srshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_srshl_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: srshl v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + diff --git a/test/CodeGen/AArch64/neon-saturating-add-sub.ll b/test/CodeGen/AArch64/neon-saturating-add-sub.ll new file mode 100644 index 0000000000..b2fac1fbc1 --- /dev/null +++ b/test/CodeGen/AArch64/neon-saturating-add-sub.ll @@ -0,0 +1,274 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_uqadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_uqadd_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: uqadd v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_sqadd_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_sqadd_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vqadds.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: sqadd v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 
x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_uqadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_uqadd_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vqaddu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: uqadd v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_sqadd_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_sqadd_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vqadds.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: sqadd v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_uqadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_uqadd_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vqaddu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: uqadd v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_sqadd_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_sqadd_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: sqadd v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_uqadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_uqadd_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vqaddu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: uqadd v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_sqadd_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_sqadd_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: sqadd v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +declare <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_uqadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_uqadd_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vqaddu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: uqadd v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_sqadd_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_sqadd_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: sqadd v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_uqadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_uqadd_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vqaddu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: uqadd v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_sqadd_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_sqadd_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: sqadd v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +declare <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64>, <1 x i64>) + +define <1 x i64> @test_uqadd_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_uqadd_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vqaddu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +; CHECK: uqadd d0, d0, d1 + ret <1 x i64> %tmp1 +} + +define <1 x i64> 
@test_sqadd_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_sqadd_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vqadds.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +; CHECK: sqadd d0, d0, d1 + ret <1 x i64> %tmp1 +} + +declare <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) + +define <2 x i64> @test_uqadd_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_uqadd_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vqaddu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: uqadd v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + +define <2 x i64> @test_sqadd_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_sqadd_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: sqadd v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + +declare <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_uqsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_uqsub_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vqsubu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: uqsub v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_sqsub_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_sqsub_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vqsubs.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: sqsub v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_uqsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_uqsub_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vqsubu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: uqsub v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_sqsub_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_sqsub_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vqsubs.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: sqsub v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_uqsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_uqsub_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vqsubu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: uqsub v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_sqsub_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_sqsub_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: sqsub v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_uqsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_uqsub_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: uqsub v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_sqsub_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_sqsub_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: sqsub v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +declare <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_uqsub_v2i32(<2 x i32> %lhs, <2 x 
i32> %rhs) { +; CHECK: test_uqsub_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vqsubu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: uqsub v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_sqsub_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_sqsub_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: sqsub v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_uqsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_uqsub_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vqsubu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: uqsub v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_sqsub_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_sqsub_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: sqsub v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +declare <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) + +define <2 x i64> @test_uqsub_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_uqsub_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vqsubu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: uqsub v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + +define <2 x i64> @test_sqsub_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_sqsub_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: sqsub v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + +declare <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64>, <1 x i64>) + +define <1 x i64> @test_uqsub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_uqsub_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vqsubu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +; CHECK: uqsub d0, d0, d1 + ret <1 x i64> %tmp1 +} + +define <1 x i64> @test_sqsub_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_sqsub_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vqsubs.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +; CHECK: sqsub d0, d0, d1 + ret <1 x i64> %tmp1 +} + diff --git a/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll b/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll new file mode 100644 index 0000000000..05d8dfea9d --- /dev/null +++ b/test/CodeGen/AArch64/neon-saturating-rounding-shift.ll @@ -0,0 +1,138 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_uqrshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_uqrshl_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: uqrshl v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_sqrshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_sqrshl_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: sqrshl v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_uqrshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_uqrshl_v16i8: + %tmp1 = call <16 x i8> 
@llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: uqrshl v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_sqrshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_sqrshl_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: sqrshl v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_uqrshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_uqrshl_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: uqrshl v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_sqrshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_sqrshl_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: sqrshl v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_uqrshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_uqrshl_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: uqrshl v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_sqrshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_sqrshl_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: sqrshl v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +declare <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_uqrshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_uqrshl_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: uqrshl v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_sqrshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_sqrshl_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: sqrshl v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_uqrshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_uqrshl_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: uqrshl v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_sqrshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_sqrshl_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: sqrshl v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +declare <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64>, <1 x i64>) + +define <1 x i64> @test_uqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_uqrshl_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +; CHECK: uqrshl d0, d0, d1 + ret <1 x i64> %tmp1 +} + +define <1 x i64> @test_sqrshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_sqrshl_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %lhs, <1 x 
i64> %rhs) +; CHECK: sqrshl d0, d0, d1 + ret <1 x i64> %tmp1 +} + +declare <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64>, <2 x i64>) + +define <2 x i64> @test_uqrshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_uqrshl_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: uqrshl v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + +define <2 x i64> @test_sqrshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_sqrshl_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: sqrshl v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + diff --git a/test/CodeGen/AArch64/neon-saturating-shift.ll b/test/CodeGen/AArch64/neon-saturating-shift.ll new file mode 100644 index 0000000000..3b7f78cc79 --- /dev/null +++ b/test/CodeGen/AArch64/neon-saturating-shift.ll @@ -0,0 +1,138 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_uqshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_uqshl_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: uqshl v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_sqshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_sqshl_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vqshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: sqshl v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8>, <16 x i8>) +declare <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_uqshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_uqshl_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vqshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: uqshl v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_sqshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_sqshl_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vqshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: sqshl v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_uqshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_uqshl_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vqshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: uqshl v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_sqshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_sqshl_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vqshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: sqshl v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_uqshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_uqshl_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vqshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: uqshl v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_sqshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_sqshl_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vqshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: sqshl v0.8h, v0.8h, v1.8h + ret 
<8 x i16> %tmp1 +} + +declare <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_uqshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_uqshl_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vqshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: uqshl v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_sqshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_sqshl_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vqshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: sqshl v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_uqshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_uqshl_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vqshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: uqshl v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_sqshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_sqshl_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vqshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: sqshl v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +declare <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64>, <1 x i64>) + +define <1 x i64> @test_uqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_uqshl_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +; CHECK: uqshl d0, d0, d1 + ret <1 x i64> %tmp1 +} + +define <1 x i64> @test_sqshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_sqshl_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vqshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +; CHECK: sqshl d0, d0, d1 + ret <1 x i64> %tmp1 +} + +declare <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64>, <2 x i64>) + +define <2 x i64> @test_uqshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_uqshl_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vqshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: uqshl v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + +define <2 x i64> @test_sqshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_sqshl_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vqshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: sqshl v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + diff --git a/test/CodeGen/AArch64/neon-shift.ll b/test/CodeGen/AArch64/neon-shift.ll new file mode 100644 index 0000000000..45a2605799 --- /dev/null +++ b/test/CodeGen/AArch64/neon-shift.ll @@ -0,0 +1,140 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s + +declare <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8>, <8 x i8>) +declare <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8>, <8 x i8>) + +define <8 x i8> @test_uqshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_uqshl_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: ushl v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +define <8 x i8> @test_sqshl_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) { +; CHECK: test_sqshl_v8i8: + %tmp1 = call <8 x i8> @llvm.arm.neon.vshifts.v8i8(<8 x i8> %lhs, <8 x i8> %rhs) +; CHECK: sshl v0.8b, v0.8b, v1.8b + ret <8 x i8> %tmp1 +} + +declare <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8>, <16 x i8>) 
+declare <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_ushl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_ushl_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: ushl v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +define <16 x i8> @test_sshl_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) { +; CHECK: test_sshl_v16i8: + %tmp1 = call <16 x i8> @llvm.arm.neon.vshifts.v16i8(<16 x i8> %lhs, <16 x i8> %rhs) +; CHECK: sshl v0.16b, v0.16b, v1.16b + ret <16 x i8> %tmp1 +} + +declare <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16>, <4 x i16>) +declare <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16>, <4 x i16>) + +define <4 x i16> @test_ushl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_ushl_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vshiftu.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: ushl v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +define <4 x i16> @test_sshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { +; CHECK: test_sshl_v4i16: + %tmp1 = call <4 x i16> @llvm.arm.neon.vshifts.v4i16(<4 x i16> %lhs, <4 x i16> %rhs) +; CHECK: sshl v0.4h, v0.4h, v1.4h + ret <4 x i16> %tmp1 +} + +declare <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16>, <8 x i16>) +declare <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_ushl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_ushl_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: ushl v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +define <8 x i16> @test_sshl_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) { +; CHECK: test_sshl_v8i16: + %tmp1 = call <8 x i16> @llvm.arm.neon.vshifts.v8i16(<8 x i16> %lhs, <8 x i16> %rhs) +; CHECK: sshl v0.8h, v0.8h, v1.8h + ret <8 x i16> %tmp1 +} + +declare <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32>, <2 x i32>) +declare <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32>, <2 x i32>) + +define <2 x i32> @test_ushl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_ushl_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vshiftu.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: ushl v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +define <2 x i32> @test_sshl_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { +; CHECK: test_sshl_v2i32: + %tmp1 = call <2 x i32> @llvm.arm.neon.vshifts.v2i32(<2 x i32> %lhs, <2 x i32> %rhs) +; CHECK: sshl v0.2s, v0.2s, v1.2s + ret <2 x i32> %tmp1 +} + +declare <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32>, <4 x i32>) +declare <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i32> @test_ushl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_ushl_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: ushl v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +define <4 x i32> @test_sshl_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { +; CHECK: test_sshl_v4i32: + %tmp1 = call <4 x i32> @llvm.arm.neon.vshifts.v4i32(<4 x i32> %lhs, <4 x i32> %rhs) +; CHECK: sshl v0.4s, v0.4s, v1.4s + ret <4 x i32> %tmp1 +} + +declare <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64>, <1 x i64>) +declare <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64>, <1 x i64>) + +define <1 x i64> @test_ushl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: test_ushl_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +; CHECK: ushl d0, d0, d1 + ret <1 x i64> %tmp1 +} + +define <1 x i64> @test_sshl_v1i64(<1 x i64> %lhs, <1 x i64> %rhs) { +; CHECK: 
test_sshl_v1i64: + %tmp1 = call <1 x i64> @llvm.arm.neon.vshifts.v1i64(<1 x i64> %lhs, <1 x i64> %rhs) +; CHECK: sshl d0, d0, d1 + ret <1 x i64> %tmp1 +} + +declare <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64>, <2 x i64>) +declare <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64>, <2 x i64>) + +define <2 x i64> @test_ushl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_ushl_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: ushl v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + +define <2 x i64> @test_sshl_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) { +; CHECK: test_sshl_v2i64: + %tmp1 = call <2 x i64> @llvm.arm.neon.vshifts.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) +; CHECK: sshl v0.2d, v0.2d, v1.2d + ret <2 x i64> %tmp1 +} + + +
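The tests added in this change follow two FileCheck idioms: operations with no plain-IR equivalent (the saturating, rounding and halving variants) call the llvm.arm.neon.* / llvm.aarch64.neon.* intrinsic directly and check the instruction it selects to, while operations the backend is expected to fuse (mla, mls) are written as ordinary mul/add IR and checked for the combined instruction. A minimal sketch showing both idioms in the same style (illustrative only, not part of the commit; the function names are invented, and the fixed v0/v1 registers in the CHECK lines assume the arguments arrive in the first two vector argument registers, as in the tests above):

; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s

declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>)

; Saturating add has no plain-IR form, so the intrinsic is called directly
; and FileCheck verifies the instruction it is selected to.
define <4 x i32> @example_sqadd(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK: example_sqadd:
  %val = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
; CHECK: sqadd v0.4s, v0.4s, v1.4s
  ret <4 x i32> %val
}

; mul followed by add is ordinary IR; the test checks that instruction
; selection fuses the pair into a single mla.
define <4 x i32> @example_mla(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C) {
; CHECK: mla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s
  %tmp1 = mul <4 x i32> %A, %B
  %tmp2 = add <4 x i32> %C, %tmp1
  ret <4 x i32> %tmp2
}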