summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorElena Demikhovsky <elena.demikhovsky@intel.com>2012-10-24 14:46:16 +0000
committerElena Demikhovsky <elena.demikhovsky@intel.com>2012-10-24 14:46:16 +0000
commit3575222175b4982f380ff291bb17be67aadc0966 (patch)
treea0055d4266babdb3572eb09191dff2b8f499dfc6
parent0ed9b8e0bf065470fcffb8c2ced3d794b376356d (diff)
downloadllvm-3575222175b4982f380ff291bb17be67aadc0966.tar.gz
llvm-3575222175b4982f380ff291bb17be67aadc0966.tar.bz2
llvm-3575222175b4982f380ff291bb17be67aadc0966.tar.xz
Special calling conventions for Intel OpenCL built-in library.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166566 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--include/llvm/CallingConv.h6
-rw-r--r--lib/AsmParser/LLLexer.cpp1
-rw-r--r--lib/AsmParser/LLParser.cpp2
-rw-r--r--lib/AsmParser/LLToken.h1
-rw-r--r--lib/Target/X86/X86CallingConv.td57
-rw-r--r--lib/Target/X86/X86RegisterInfo.cpp21
-rw-r--r--lib/VMCore/AsmWriter.cpp1
-rw-r--r--lib/VMCore/Verifier.cpp1
-rw-r--r--test/CodeGen/X86/avx-intel-ocl.ll107
-rw-r--r--test/CodeGen/X86/sse-intel-ocl.ll93
10 files changed, 289 insertions, 1 deletions
diff --git a/include/llvm/CallingConv.h b/include/llvm/CallingConv.h
index 86e4eebb82..053f4eb326 100644
--- a/include/llvm/CallingConv.h
+++ b/include/llvm/CallingConv.h
@@ -112,7 +112,11 @@ namespace CallingConv {
/// Cannot have variable arguments.
/// Can also be called by the host.
/// Is externally visible.
- SPIR_KERNEL = 76
+ SPIR_KERNEL = 76,
+
+ /// Intel_OCL_BI - Calling conventions for Intel OpenCL built-ins
+ Intel_OCL_BI = 77
+
};
} // End CallingConv namespace
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
index 464dfd51d6..91f973d8d3 100644
--- a/lib/AsmParser/LLLexer.cpp
+++ b/lib/AsmParser/LLLexer.cpp
@@ -527,6 +527,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(ptx_device);
KEYWORD(spir_kernel);
KEYWORD(spir_func);
+ KEYWORD(intel_ocl_bicc);
KEYWORD(cc);
KEYWORD(c);
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index cc7cc31246..75fc16cd95 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -1094,6 +1094,7 @@ bool LLParser::ParseOptionalVisibility(unsigned &Res) {
/// ::= /*empty*/
/// ::= 'ccc'
/// ::= 'fastcc'
+/// ::= 'kw_intel_ocl_bicc'
/// ::= 'coldcc'
/// ::= 'x86_stdcallcc'
/// ::= 'x86_fastcallcc'
@@ -1125,6 +1126,7 @@ bool LLParser::ParseOptionalCallingConv(CallingConv::ID &CC) {
case lltok::kw_ptx_device: CC = CallingConv::PTX_Device; break;
case lltok::kw_spir_kernel: CC = CallingConv::SPIR_KERNEL; break;
case lltok::kw_spir_func: CC = CallingConv::SPIR_FUNC; break;
+ case lltok::kw_intel_ocl_bicc: CC = CallingConv::Intel_OCL_BI; break;
case lltok::kw_cc: {
unsigned ArbitraryCC;
Lex.Lex();
diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h
index ff6d68f0da..6cffc52d17 100644
--- a/lib/AsmParser/LLToken.h
+++ b/lib/AsmParser/LLToken.h
@@ -77,6 +77,7 @@ namespace lltok {
kw_c,
kw_cc, kw_ccc, kw_fastcc, kw_coldcc,
+ kw_intel_ocl_bicc,
kw_x86_stdcallcc, kw_x86_fastcallcc, kw_x86_thiscallcc,
kw_arm_apcscc, kw_arm_aapcscc, kw_arm_aapcs_vfpcc,
kw_msp430_intrcc,
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index c881b4d0ad..6786756c7f 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -88,6 +88,21 @@ def RetCC_X86_32_Fast : CallingConv<[
CCDelegateTo<RetCC_X86Common>
]>;
+// Intel_OCL_BI return-value convention.
+def RetCC_Intel_OCL_BI : CallingConv<[
+ // Vector types are returned in XMM0,XMM1,XMMM2 and XMM3.
+ CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>,
+
+ // 256-bit FP vectors
+ // No more than 4 registers
+ CCIfType<[v8f32, v4f64, v8i32, v4i64],
+ CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>,
+
+ // i32, i64 in the standard way
+ CCDelegateTo<RetCC_X86Common>
+]>;
+
// X86-64 C return-value convention.
def RetCC_X86_64_C : CallingConv<[
// The X86-64 calling convention always returns FP values in XMM0.
@@ -128,6 +143,10 @@ def RetCC_X86_64 : CallingConv<[
// This is the return-value convention used for the entire X86 backend.
def RetCC_X86 : CallingConv<[
+
+ // Check if this is the Intel OpenCL built-ins calling convention
+ CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<RetCC_Intel_OCL_BI>>,
+
CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>,
CCDelegateTo<RetCC_X86_32>
]>;
@@ -235,6 +254,29 @@ def CC_X86_Win64_C : CallingConv<[
CCIfType<[f80], CCAssignToStack<0, 0>>
]>;
+// X86-64 Intel OpenCL built-ins calling convention.
+def CC_Intel_OCL_BI : CallingConv<[
+ CCIfType<[i32], CCIfSubtarget<"isTargetWin32()", CCAssignToStack<4, 4>>>,
+
+ CCIfType<[i32], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[ECX, EDX, R8D, R9D]>>>,
+ CCIfType<[i64], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[RCX, RDX, R8, R9 ]>>>,
+
+ CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX]>>,
+ CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX]>>,
+
+ // The SSE vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>,
+
+ // The 256-bit vector arguments are passed in YMM registers.
+ CCIfType<[v8f32, v4f64, v8i32, v4i64],
+ CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>,
+
+ CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>,
+ CCDelegateTo<CC_X86_64_C>
+]>;
+
+
def CC_X86_64_GHC : CallingConv<[
// Promote i8/i16/i32 arguments to i64.
CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
@@ -408,6 +450,7 @@ def CC_X86_64 : CallingConv<[
// This is the argument convention used for the entire X86 backend.
def CC_X86 : CallingConv<[
+ CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>,
CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>,
CCDelegateTo<CC_X86_32>
]>;
@@ -426,3 +469,17 @@ def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>;
def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15,
(sequence "XMM%u", 6, 15))>;
+
+
+// Standard C + YMM6-15
+def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12,
+ R13, R14, R15,
+ (sequence "YMM%u", 6, 15))>;
+
+//Standard C + XMM 8-15
+def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64,
+ (sequence "XMM%u", 8, 15))>;
+
+//Standard C + YMM 8-15
+def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64,
+ (sequence "YMM%u", 8, 15))>;
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 4bcf6b1f19..73ac747742 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -229,15 +229,26 @@ const uint16_t *
X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
bool callsEHReturn = false;
bool ghcCall = false;
+ bool oclBiCall = false;
+ bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
if (MF) {
callsEHReturn = MF->getMMI().callsEHReturn();
const Function *F = MF->getFunction();
ghcCall = (F ? F->getCallingConv() == CallingConv::GHC : false);
+ oclBiCall = (F ? F->getCallingConv() == CallingConv::Intel_OCL_BI : false);
}
if (ghcCall)
return CSR_NoRegs_SaveList;
+ if (oclBiCall) {
+ if (HasAVX && IsWin64)
+ return CSR_Win64_Intel_OCL_BI_AVX_SaveList;
+ if (HasAVX && Is64Bit)
+ return CSR_64_Intel_OCL_BI_AVX_SaveList;
+ if (!HasAVX && !IsWin64 && Is64Bit)
+ return CSR_64_Intel_OCL_BI_SaveList;
+ }
if (Is64Bit) {
if (IsWin64)
return CSR_Win64_SaveList;
@@ -252,6 +263,16 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const uint32_t*
X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+ bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+
+ if (CC == CallingConv::Intel_OCL_BI) {
+ if (IsWin64 && HasAVX)
+ return CSR_Win64_Intel_OCL_BI_AVX_RegMask;
+ if (Is64Bit && HasAVX)
+ return CSR_64_Intel_OCL_BI_AVX_RegMask;
+ if (!HasAVX && !IsWin64 && Is64Bit)
+ return CSR_64_Intel_OCL_BI_RegMask;
+ }
if (CC == CallingConv::GHC)
return CSR_NoRegs_RegMask;
if (!Is64Bit)
diff --git a/lib/VMCore/AsmWriter.cpp b/lib/VMCore/AsmWriter.cpp
index 5e23e6fc78..b72c17f667 100644
--- a/lib/VMCore/AsmWriter.cpp
+++ b/lib/VMCore/AsmWriter.cpp
@@ -74,6 +74,7 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out)
case CallingConv::X86_StdCall: Out << "x86_stdcallcc"; break;
case CallingConv::X86_FastCall: Out << "x86_fastcallcc"; break;
case CallingConv::X86_ThisCall: Out << "x86_thiscallcc"; break;
+ case CallingConv::Intel_OCL_BI: Out << "intel_ocl_bicc"; break;
case CallingConv::ARM_APCS: Out << "arm_apcscc"; break;
case CallingConv::ARM_AAPCS: Out << "arm_aapcscc"; break;
case CallingConv::ARM_AAPCS_VFP:Out << "arm_aapcs_vfpcc"; break;
diff --git a/lib/VMCore/Verifier.cpp b/lib/VMCore/Verifier.cpp
index fd629b485a..eb40b09d29 100644
--- a/lib/VMCore/Verifier.cpp
+++ b/lib/VMCore/Verifier.cpp
@@ -705,6 +705,7 @@ void Verifier::visitFunction(Function &F) {
case CallingConv::Cold:
case CallingConv::X86_FastCall:
case CallingConv::X86_ThisCall:
+ case CallingConv::Intel_OCL_BI:
case CallingConv::PTX_Kernel:
case CallingConv::PTX_Device:
Assert1(!F.isVarArg(),
diff --git a/test/CodeGen/X86/avx-intel-ocl.ll b/test/CodeGen/X86/avx-intel-ocl.ll
new file mode 100644
index 0000000000..f0c219fb98
--- /dev/null
+++ b/test/CodeGen/X86/avx-intel-ocl.ll
@@ -0,0 +1,107 @@
+; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+avx | FileCheck -check-prefix=WIN32 %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+avx | FileCheck -check-prefix=WIN64 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck -check-prefix=NOT_WIN %s
+
+declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
+declare <16 x float> @func_float16(<16 x float>, <16 x float>)
+; WIN64: testf16_inp
+; WIN64: vaddps {{.*}}, {{%ymm[0-1]}}
+; WIN64: vaddps {{.*}}, {{%ymm[0-1]}}
+; WIN64: leaq {{.*}}(%rsp), %rcx
+; WIN64: call
+; WIN64: ret
+
+; WIN32: testf16_inp
+; WIN32: movl %eax, (%esp)
+; WIN32: vaddps {{.*}}, {{%ymm[0-1]}}
+; WIN32: vaddps {{.*}}, {{%ymm[0-1]}}
+; WIN32: call
+; WIN32: ret
+
+; NOT_WIN: testf16_inp
+; NOT_WIN: vaddps {{.*}}, {{%ymm[0-1]}}
+; NOT_WIN: vaddps {{.*}}, {{%ymm[0-1]}}
+; NOT_WIN: leaq {{.*}}(%rsp), %rdi
+; NOT_WIN: call
+; NOT_WIN: ret
+
+;test calling conventions - input parameters
+define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
+ %y = alloca <16 x float>, align 16
+ %x = fadd <16 x float> %a, %b
+ %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
+ %2 = load <16 x float>* %y, align 16
+ %3 = fadd <16 x float> %2, %1
+ ret <16 x float> %3
+}
+
+;test calling conventions - preserved registers
+
+; preserved ymm6-ymm15
+; WIN64: testf16_regs
+; WIN64: call
+; WIN64: vaddps {{%ymm[6-7]}}, %ymm0, %ymm0
+; WIN64: vaddps {{%ymm[6-7]}}, %ymm1, %ymm1
+; WIN64: ret
+
+; preserved ymm8-ymm15
+; NOT_WIN: testf16_regs
+; NOT_WIN: call
+; NOT_WIN: vaddps {{%ymm[8-9]}}, %ymm0, %ymm0
+; NOT_WIN: vaddps {{%ymm[8-9]}}, %ymm1, %ymm1
+; NOT_WIN: ret
+
+define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
+ %y = alloca <16 x float>, align 16
+ %x = fadd <16 x float> %a, %b
+ %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
+ %2 = load <16 x float>* %y, align 16
+ %3 = fadd <16 x float> %1, %b
+ %4 = fadd <16 x float> %2, %3
+ ret <16 x float> %4
+}
+
+; test calling conventions - prolog and epilog
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill
+; WIN64: call
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload
+
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
+; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill
+; NOT_WIN: call
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload
+define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
+ %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
+ ret <16 x float> %c
+} \ No newline at end of file
diff --git a/test/CodeGen/X86/sse-intel-ocl.ll b/test/CodeGen/X86/sse-intel-ocl.ll
new file mode 100644
index 0000000000..188505072f
--- /dev/null
+++ b/test/CodeGen/X86/sse-intel-ocl.ll
@@ -0,0 +1,93 @@
+; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=nehalem | FileCheck -check-prefix=WIN32 %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=nehalem | FileCheck -check-prefix=WIN64 %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=nehalem | FileCheck -check-prefix=NOT_WIN %s
+
+declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *)
+declare <16 x float> @func_float16(<16 x float>, <16 x float>)
+; WIN64: testf16_inp
+; WIN64: addps {{.*}}, {{%xmm[0-3]}}
+; WIN64: addps {{.*}}, {{%xmm[0-3]}}
+; WIN64: addps {{.*}}, {{%xmm[0-3]}}
+; WIN64: addps {{.*}}, {{%xmm[0-3]}}
+; WIN64: leaq {{.*}}(%rsp), %rcx
+; WIN64: call
+; WIN64: ret
+
+; WIN32: testf16_inp
+; WIN32: movl %eax, (%esp)
+; WIN32: addps {{.*}}, {{%xmm[0-3]}}
+; WIN32: addps {{.*}}, {{%xmm[0-3]}}
+; WIN32: addps {{.*}}, {{%xmm[0-3]}}
+; WIN32: addps {{.*}}, {{%xmm[0-3]}}
+; WIN32: call
+; WIN32: ret
+
+; NOT_WIN: testf16_inp
+; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}}
+; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}}
+; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}}
+; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}}
+; NOT_WIN: leaq {{.*}}(%rsp), %rdi
+; NOT_WIN: call
+; NOT_WIN: ret
+
+;test calling conventions - input parameters
+define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
+ %y = alloca <16 x float>, align 16
+ %x = fadd <16 x float> %a, %b
+ %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
+ %2 = load <16 x float>* %y, align 16
+ %3 = fadd <16 x float> %2, %1
+ ret <16 x float> %3
+}
+
+;test calling conventions - preserved registers
+
+; preserved xmm6-xmm15
+; WIN64: testf16_regs
+; WIN64: call
+; WIN64: addps {{%xmm[6-9]}}, {{.*}}
+; WIN64: addps {{%xmm[6-9]}}, {{.*}}
+; WIN64: ret
+
+; preserved xmm8-xmm15
+; NOT_WIN: testf16_regs
+; NOT_WIN: call
+; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, {{.*}}
+; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, {{.*}}
+; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, {{.*}}
+; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, {{.*}}
+; NOT_WIN: ret
+
+define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
+ %y = alloca <16 x float>, align 16
+ %x = fadd <16 x float> %a, %b
+ %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y)
+ %2 = load <16 x float>* %y, align 16
+ %3 = fadd <16 x float> %1, %b
+ %4 = fadd <16 x float> %2, %3
+ ret <16 x float> %4
+}
+
+; test calling conventions - prolog and epilog
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
+; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill
+; NOT_WIN: call
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
+; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload
+define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind {
+ %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
+ ret <16 x float> %c
+}