author     Abdoulaye Walsimou Gaye <awg@embtoolkit.org>    2013-06-12 21:27:20 +0200
committer  Abdoulaye Walsimou Gaye <awg@embtoolkit.org>    2013-06-12 21:27:20 +0200
commit     46ef71a74329a0777e3464c65927cd3e59c928d6
tree       e8c06aeb70dcbe18acdf6db1f88a1452ac1dca68
parent     32c493313c30abe0830e95dc92c7ad1c2241ba57
parent     ce337502f5a88500df9ab2f59ab48f97be0b4395
Merge branch 'release_33' of git://github.com/llvm-mirror/llvm into embtk-support-release-3.3
66 files changed, 1554 insertions(+), 1540 deletions(-)
diff --git a/autoconf/configure.ac b/autoconf/configure.ac index c1efd31a62..ffd155d2e2 100644 --- a/autoconf/configure.ac +++ b/autoconf/configure.ac @@ -31,7 +31,7 @@ dnl=== dnl===-----------------------------------------------------------------------=== dnl Initialize autoconf and define the package name, version number and dnl address for reporting bugs. -AC_INIT([LLVM],[3.3svn],[http://llvm.org/bugs/]) +AC_INIT([LLVM],[3.3],[http://llvm.org/bugs/]) AC_DEFINE([LLVM_VERSION_MAJOR], [3], [Major version of the LLVM API]) AC_DEFINE([LLVM_VERSION_MINOR], [3], [Minor version of the LLVM API]) @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.60 for LLVM 3.3svn. +# Generated by GNU Autoconf 2.60 for LLVM 3.3. # # Report bugs to <http://llvm.org/bugs/>. # @@ -561,8 +561,8 @@ SHELL=${CONFIG_SHELL-/bin/sh} # Identity of this package. PACKAGE_NAME='LLVM' PACKAGE_TARNAME='llvm' -PACKAGE_VERSION='3.3svn' -PACKAGE_STRING='LLVM 3.3svn' +PACKAGE_VERSION='3.3' +PACKAGE_STRING='LLVM 3.3' PACKAGE_BUGREPORT='http://llvm.org/bugs/' ac_unique_file="lib/IR/Module.cpp" @@ -1331,7 +1331,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures LLVM 3.3svn to adapt to many kinds of systems. +\`configure' configures LLVM 3.3 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1397,7 +1397,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of LLVM 3.3svn:";; + short | recursive ) echo "Configuration of LLVM 3.3:";; esac cat <<\_ACEOF @@ -1569,7 +1569,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -LLVM configure 3.3svn +LLVM configure 3.3 generated by GNU Autoconf 2.60 Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, @@ -1585,7 +1585,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by LLVM $as_me 3.3svn, which was +It was created by LLVM $as_me 3.3, which was generated by GNU Autoconf 2.60. Invocation command line was $ $0 $@ @@ -23128,7 +23128,7 @@ exec 6>&1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by LLVM $as_me 3.3svn, which was +This file was extended by LLVM $as_me 3.3, which was generated by GNU Autoconf 2.60. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -23181,7 +23181,7 @@ Report bugs to <bug-autoconf@gnu.org>." _ACEOF cat >>$CONFIG_STATUS <<_ACEOF ac_cs_version="\\ -LLVM config.status 3.3svn +LLVM config.status 3.3 configured by $0, generated by GNU Autoconf 2.60, with options \\"`echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\" diff --git a/docs/CommandLine.rst b/docs/CommandLine.rst index 263a025f69..9b77a98908 100644 --- a/docs/CommandLine.rst +++ b/docs/CommandLine.rst @@ -618,6 +618,8 @@ would yield the help output: -help - display available options (-help-hidden for more) -o <filename> - Specify output filename +.. 
_grouping options into categories: + Grouping options into categories -------------------------------- diff --git a/docs/LangRef.rst b/docs/LangRef.rst index 410f640776..7743ff06a0 100644 --- a/docs/LangRef.rst +++ b/docs/LangRef.rst @@ -2868,11 +2868,10 @@ All globals of this sort should have a section specified as The '``llvm.used``' Global Variable ----------------------------------- -The ``@llvm.used`` global is an array which has - :ref:`appending linkage <linkage_appending>`. This array contains a list of -pointers to global variables, functions and aliases which may optionally have a -pointer cast formed of bitcast or getelementptr. For example, a legal -use of it is: +The ``@llvm.used`` global is an array which has :ref:`appending linkage +<linkage_appending>`. This array contains a list of pointers to global +variables, functions and aliases which may optionally have a pointer cast formed +of bitcast or getelementptr. For example, a legal use of it is: .. code-block:: llvm diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst index 7952cd5423..73b0abf628 100644 --- a/docs/ReleaseNotes.rst +++ b/docs/ReleaseNotes.rst @@ -5,12 +5,6 @@ LLVM 3.3 Release Notes .. contents:: :local: -.. warning:: - These are in-progress notes for the upcoming LLVM 3.3 release. You may - prefer the `LLVM 3.2 Release Notes <http://llvm.org/releases/3.2/docs - /ReleaseNotes.html>`_. - - Introduction ============ @@ -34,13 +28,6 @@ page <http://llvm.org/releases/>`_. Non-comprehensive list of changes in this release ================================================= -.. NOTE - For small 1-3 sentence descriptions, just add an entry at the end of - this list. If your description won't fit comfortably in one bullet - point (e.g. maybe you would like to give an example of the - functionality, or simply have a lot to talk about), see the `NOTE` below - for adding a new subsection. - * The CellSPU port has been removed. It can still be found in older versions. * The IR-level extended linker APIs (for example, to link bitcode files out of @@ -70,17 +57,15 @@ Non-comprehensive list of changes in this release examples of the new syntax. The old syntax using register classes still works, but it will be removed in a future LLVM release. -* ... next change ... - -.. NOTE - If you would like to document a larger change, then you can add a - subsection about it right here. You can copy the following boilerplate - and un-indent it (the indentation causes it to be inside this comment). +* MCJIT now supports exception handling. Support for it in the old jit will be + removed in the 3.4 release. - Special New Feature - ------------------- +* Command line options can now be grouped into categories which are shown in + the output of ``-help``. See :ref:`grouping options into categories`. - Makes programs 10x faster by doing Special New Thing. +* The appearance of command line options in ``-help`` that are inherited by + linking with libraries that use the LLVM Command line support library can now + be modified at runtime. See :ref:`cl::getRegisteredOptions`. AArch64 target -------------- @@ -99,9 +84,59 @@ GNU-style thread local storage and inline assembly. Hexagon Target -------------- -- Removed support for legacy hexagonv2 and hexagonv3 processor - architectures which are no longer in use. Currently supported - architectures are hexagonv4 and hexagonv5. +Removed support for legacy hexagonv2 and hexagonv3 processor architectures which +are no longer in use. Currently supported architectures are hexagonv4 and +hexagonv5. 
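As a brief illustration of the option-category feature announced in the release notes above: a minimal sketch against the 3.3-era ``cl::`` API. The tool and option names here are invented for the example and are not taken from the patch; whether a particular inherited option name is registered depends on which LLVM libraries are linked in.

```cpp
#include "llvm/ADT/StringMap.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;

// Options registered with cl::cat() are grouped under their category's
// heading in the -help listing.
static cl::OptionCategory StageCat("Stage Selection Options");
static cl::opt<bool> Stage1("stage1", cl::desc("Run the first stage"),
                            cl::cat(StageCat));
static cl::opt<bool> Stage2("stage2", cl::desc("Run the second stage"),
                            cl::cat(StageCat));

int main(int argc, char **argv) {
  // Options inherited from linked-in LLVM libraries can be looked up and
  // adjusted before parsing, e.g. hidden from the -help output. The option
  // name queried here is only an example; it may or may not be present.
  StringMap<cl::Option *> Opts;
  cl::getRegisteredOptions(Opts); // 3.3-era signature fills a caller's map
  if (cl::Option *O = Opts.lookup("print-all-options"))
    O->setHiddenFlag(cl::ReallyHidden);
  cl::ParseCommandLineOptions(argc, argv, "option category demo\n");
  return 0;
}
```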
+ +Mips target +-------------- + +New features and improvements: + +- Clang driver + - Support for Sourcery CodeBench Mips toolchain directories tree. + - Support for new command line options including: + - -mxgot/-mno-xgot + - -EL / -EB + - -mmicromips / -mno-micromips + - -msingle-float / -mdouble-float + - -mabi=32 (o32 abi) and -mabi=64 (n64 abi) + - Previously, options such as -mips16, -mmicromips, -mdsp and -mdspr2 were + not passed to the assembler. This issue has been fixed. + +- A number of changes have been made to improve the quality of DSP-ASE code + generation. + - Multiply and multiply-accumulate instructions can now use all four + accumulators. + - Instruction selection patterns have been added so that DSP instructions + are emitted without having to use builtins. + +- Delay slot filler pass can now search successor blocks for instructions to + fill delay slots (use option -disable-mips-df-succbb-search=false). + +PowerPC Target +-------------- + +New features and improvements: + +- PowerPC now supports an assembly parser. +- Support added for thread-local storage. 64-bit ELF subtarget only. +- Support added for medium and large code model (-mcmodel=medium,large). + Medium code model is now the default. 64-bit ELF subtarget only. +- Improved register allocation (fewer reserved registers). +- 64-bit atomic load and store are now supported. +- Improved code generation for unaligned memory accesses of scalar types. +- Improved performance of floating-point divide and square root + with -ffast-math. +- Support for predicated returns. +- Improved code generation for comparisons. +- Support added for inline setjmp and longjmp. +- Support added for many instructions introduced in PowerISA 2.04, 2.05, + and 2.06. +- Improved spill code for vector registers. +- Support added for -mno-altivec. +- ABI compatibility fixes for complex parameters, 128-bit integer parameters, + and varargs functions. 64-bit ELF subtarget only. Loop Vectorizer --------------- @@ -126,16 +161,16 @@ SLP Vectorizer -------------- LLVM now has a new SLP vectorizer. The new SLP vectorizer is not enabled by -default but can be enabled using the clang flag -fslp-vectorize. The BB-vectorizer -can also be enabled using the command line flag -fslp-vectorize-aggressive. +default but can be enabled using the clang flag ``-fslp-vectorize``. The +BB-vectorizer can also be enabled using the command line flag +``-fslp-vectorize-aggressive``. R600 Backend ------------ -The R600 backend was added in this release, it supports AMD GPUs -(HD2XXX - HD7XXX). This backend is used in AMD's Open Source -graphics / compute drivers which are developed as part of the `Mesa3D -<http://www.mesa3d.org>`_ project. +The R600 backend was added in this release, it supports AMD GPUs (HD2XXX - +HD7XXX). This backend is used in AMD's Open Source graphics / compute drivers +which are developed as part of the `Mesa3D <http://www.mesa3d.org>`_ project. SystemZ/s390x Backend --------------------- @@ -145,41 +180,130 @@ is restricted to GNU/Linux (GNU triplet s390x-linux-gnu) and requires z10 or greater. +Sub-project Status Update +========================= + +In addition to the core LLVM 3.3 distribution of production-quality compiler +infrastructure, the LLVM project includes sub-projects that use the LLVM core +and share the same distribution license. This section provides updates on these +sub-projects. 
+ + +DragonEgg: GCC front-ends, LLVM back-end +---------------------------------------- + +`DragonEgg <http://dragonegg.llvm.org/>`_ is a +`GCC plugin <http://gcc.gnu.org/wiki/plugins>`_ that replaces GCC's optimizers +and code generators with LLVM's. It works with gcc-4.5, 4.6, 4.7 and 4.8, can +target the x86-32/x86-64 and ARM processor families, and has been successfully +used on the Darwin, FreeBSD, KFreeBSD, Linux and OpenBSD platforms. It fully +supports Ada, C, C++ and Fortran. It has partial support for Go, Java, Obj-C +and Obj-C++. Note that gcc-4.6 is the best supported version, and that Ada in +particular doesn't work well with gcc-4.7 and newer. + +The `3.3 release <http://llvm.org/apt/>`_ has the following notable changes. + +- supports gcc-4.8 (requires gcc-4.8.1 or newer) +- object files can be written directly using LLVM's integrated assembler +- produces saner debug info +- bitfields can now contain arbitrary scalar types (useful for Ada) + + +LLDB: Low Level Debugger +------------------------ + +`LLDB <http://lldb.llvm.org/>`_ is a ground-up implementation of a command-line +debugger, as well as a debugger API that can be used from scripts and other +applications. LLDB uses the following components of the LLVM core distribution +to support the latest language features and target support: + +- the Clang parser for high-quality parsing of C, C++ and Objective C +- the LLVM disassembler +- the LLVM JIT compiler (MCJIT) for expression evaluation + +The `3.3 release <http://lldb.llvm.org/download.html>`_ has the following notable changes. + +Features now supported on Linux: + +- Debugging multi-threaded programs +- Support for watchpoints +- Process list, attach and fork +- `vim integration <http://llvm.org/svn/llvm-project/lldb/branches/release_33/utils/vim-lldb/README>`_ for LLDB + +Portability: + +- Builds with cmake, ninja, auto-tools, clang 3.3 and gcc 4.6 + +Linux Improvements: + +- Improved register support including vector registers +- Basic debugging of i386 programs +- Bug fixes for expression evaluation + + External Open Source Projects Using LLVM 3.3 ============================================ -An exciting aspect of LLVM is that it is used as an enabling technology for -a lot of other language and tools projects. This section lists some of the +An exciting aspect of LLVM is that it is used as an enabling technology for a +lot of other language and tools projects. This section lists some of the projects that have already been updated to work with LLVM 3.3. Portable Computing Language (pocl) ---------------------------------- -In addition to producing an easily portable open source OpenCL -implementation, another major goal of `pocl <http://pocl.sourceforge.net/>`_ -is improving performance portability of OpenCL programs with -compiler optimizations, reducing the need for target-dependent manual -optimizations. An important part of pocl is a set of LLVM passes used to -statically parallelize multiple work-items with the kernel compiler, even in -the presence of work-group barriers. This enables static parallelization of -the fine-grained static concurrency in the work groups in multiple ways. +In addition to producing an easily portable open source OpenCL implementation, +another major goal of `pocl <http://pocl.sourceforge.net/>`_ is improving +performance portability of OpenCL programs with compiler optimizations, reducing +the need for target-dependent manual optimizations. 
An important part of pocl is +a set of LLVM passes used to statically parallelize multiple work-items with the +kernel compiler, even in the presence of work-group barriers. This enables +static parallelization of the fine-grained static concurrency in the work groups +in multiple ways. TTA-based Co-design Environment (TCE) ------------------------------------- -`TCE <http://tce.cs.tut.fi/>`_ is a toolset for designing new -processors based on the Transport triggered architecture (TTA). -The toolset provides a complete co-design flow from C/C++ -programs down to synthesizable VHDL/Verilog and parallel program binaries. -Processor customization points include the register files, function units, -supported operations, and the interconnection network. +`TCE <http://tce.cs.tut.fi/>`_ is a toolset for designing new processors based +on the Transport triggered architecture (TTA). The toolset provides a complete +co-design flow from C/C++ programs down to synthesizable VHDL/Verilog and +parallel program binaries. Processor customization points include the register +files, function units, supported operations, and the interconnection network. TCE uses Clang and LLVM for C/C++/OpenCL C language support, target independent -optimizations and also for parts of code generation. It generates new -LLVM-based code generators "on the fly" for the designed TTA processors and -loads them in to the compiler backend as runtime libraries to avoid -per-target recompilation of larger parts of the compiler chain. +optimizations and also for parts of code generation. It generates new LLVM-based +code generators "on the fly" for the designed TTA processors and loads them in +to the compiler backend as runtime libraries to avoid per-target recompilation +of larger parts of the compiler chain. + +Just-in-time Adaptive Decoder Engine (Jade) +------------------------------------------- + +`Jade <https://github.com/orcc/jade>`_ (Just-in-time Adaptive Decoder Engine) is +a generic video decoder engine using LLVM for just-in-time compilation of video +decoder configurations. Those configurations are designed by MPEG Reconfigurable +Video Coding (RVC) committee. MPEG RVC standard is built on a stream-based +dataflow representation of decoders. It is composed of a standard library of +coding tools written in RVC-CAL language and a dataflow configuration --- block +diagram --- of a decoder. + +Jade project is hosted as part of the Open RVC-CAL Compiler (`Orcc +<http://orcc.sf.net>`_) and requires it to translate the RVC-CAL standard +library of video coding tools into an LLVM assembly code. + +LDC - the LLVM-based D compiler +------------------------------- + +`D <http://dlang.org>`_ is a language with C-like syntax and static typing. It +pragmatically combines efficiency, control, and modeling power, with safety and +programmer productivity. D supports powerful concepts like Compile-Time Function +Execution (CTFE) and Template Meta-Programming, provides an innovative approach +to concurrency and offers many classical paradigms. + +`LDC <http://wiki.dlang.org/LDC>`_ uses the frontend from the reference compiler +combined with LLVM as backend to produce efficient native code. LDC targets +x86/x86_64 systems like Linux, OS X and Windows and also Linux/PPC64. Ports to +other architectures like ARM are underway. 
Additional Information diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h index 306549fba4..349447fbbb 100644 --- a/include/llvm/Analysis/ScalarEvolution.h +++ b/include/llvm/Analysis/ScalarEvolution.h @@ -453,7 +453,8 @@ namespace llvm { ExitLimit ComputeExitLimitFromCond(const Loop *L, Value *ExitCond, BasicBlock *TBB, - BasicBlock *FBB); + BasicBlock *FBB, + bool IsSubExpr); /// ComputeExitLimitFromICmp - Compute the number of times the backedge of /// the specified loop will execute if its exit condition were a conditional @@ -461,7 +462,8 @@ namespace llvm { ExitLimit ComputeExitLimitFromICmp(const Loop *L, ICmpInst *ExitCond, BasicBlock *TBB, - BasicBlock *FBB); + BasicBlock *FBB, + bool IsSubExpr); /// ComputeLoadConstantCompareExitLimit - Given an exit condition /// of 'icmp op load X, cst', try to see if we can compute the @@ -483,7 +485,7 @@ namespace llvm { /// HowFarToZero - Return the number of times an exit condition comparing /// the specified value to zero will execute. If not computable, return /// CouldNotCompute. - ExitLimit HowFarToZero(const SCEV *V, const Loop *L); + ExitLimit HowFarToZero(const SCEV *V, const Loop *L, bool IsSubExpr); /// HowFarToNonZero - Return the number of times an exit condition checking /// the specified value for nonzero will execute. If not computable, return @@ -495,7 +497,7 @@ namespace llvm { /// computable, return CouldNotCompute. isSigned specifies whether the /// less-than is signed. ExitLimit HowManyLessThans(const SCEV *LHS, const SCEV *RHS, - const Loop *L, bool isSigned); + const Loop *L, bool isSigned, bool IsSubExpr); /// getPredecessorWithUniqueSuccessorForBB - Return a predecessor of BB /// (which may not be an immediate predecessor) which has exactly one diff --git a/include/llvm/IR/IntrinsicsNVVM.td b/include/llvm/IR/IntrinsicsNVVM.td index ebfd03e484..c248517def 100644 --- a/include/llvm/IR/IntrinsicsNVVM.td +++ b/include/llvm/IR/IntrinsicsNVVM.td @@ -405,6 +405,8 @@ def llvm_anyi64ptr_ty : LLVMAnyPointerType<llvm_i64_ty>; // (space)i64* // Sqrt // + def int_nvvm_sqrt_f : GCCBuiltin<"__nvvm_sqrt_f">, + Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; def int_nvvm_sqrt_rn_ftz_f : GCCBuiltin<"__nvvm_sqrt_rn_ftz_f">, Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; def int_nvvm_sqrt_rn_f : GCCBuiltin<"__nvvm_sqrt_rn_f">, diff --git a/include/llvm/MC/MCELFObjectWriter.h b/include/llvm/MC/MCELFObjectWriter.h index a59776d5cd..65dd1e8998 100644 --- a/include/llvm/MC/MCELFObjectWriter.h +++ b/include/llvm/MC/MCELFObjectWriter.h @@ -45,7 +45,14 @@ struct ELFRelocationEntry { // Support lexicographic sorting. bool operator<(const ELFRelocationEntry &RE) const { - return RE.r_offset < r_offset; + if (RE.r_offset != r_offset) + return RE.r_offset < r_offset; + if (Type != RE.Type) + return Type < RE.Type; + if (Index != RE.Index) + return Index < RE.Index; + llvm_unreachable("ELFRelocs might be unstable!"); + return 0; } }; diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 6ea915fdb0..f876748af3 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -3937,10 +3937,19 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { /// before taking the branch. For loops with multiple exits, it may not be the /// number times that the loop header executes because the loop may exit /// prematurely via another branch. 
+/// +/// FIXME: We conservatively call getBackedgeTakenCount(L) instead of +/// getExitCount(L, ExitingBlock) to compute a safe trip count considering all +/// loop exits. getExitCount() may return an exact count for this branch +/// assuming no-signed-wrap. The number of well-defined iterations may actually +/// be higher than this trip count if this exit test is skipped and the loop +/// exits via a different branch. Ideally, getExitCount() would know whether it +/// depends on a NSW assumption, and we would only fall back to a conservative +/// trip count in that case. unsigned ScalarEvolution:: -getSmallConstantTripCount(Loop *L, BasicBlock *ExitingBlock) { +getSmallConstantTripCount(Loop *L, BasicBlock */*ExitingBlock*/) { const SCEVConstant *ExitCount = - dyn_cast<SCEVConstant>(getExitCount(L, ExitingBlock)); + dyn_cast<SCEVConstant>(getBackedgeTakenCount(L)); if (!ExitCount) return 0; @@ -3967,8 +3976,8 @@ getSmallConstantTripCount(Loop *L, BasicBlock *ExitingBlock) { /// As explained in the comments for getSmallConstantTripCount, this assumes /// that control exits the loop via ExitingBlock. unsigned ScalarEvolution:: -getSmallConstantTripMultiple(Loop *L, BasicBlock *ExitingBlock) { - const SCEV *ExitCount = getExitCount(L, ExitingBlock); +getSmallConstantTripMultiple(Loop *L, BasicBlock */*ExitingBlock*/) { + const SCEV *ExitCount = getBackedgeTakenCount(L); if (ExitCount == getCouldNotCompute()) return 1; @@ -3997,7 +4006,7 @@ getSmallConstantTripMultiple(Loop *L, BasicBlock *ExitingBlock) { } // getExitCount - Get the expression for the number of loop iterations for which -// this loop is guaranteed not to exit via ExitintBlock. Otherwise return +// this loop is guaranteed not to exit via ExitingBlock. Otherwise return // SCEVCouldNotCompute. const SCEV *ScalarEvolution::getExitCount(Loop *L, BasicBlock *ExitingBlock) { return getBackedgeTakenInfo(L).getExact(ExitingBlock, this); @@ -4382,26 +4391,36 @@ ScalarEvolution::ComputeExitLimit(const Loop *L, BasicBlock *ExitingBlock) { // Proceed to the next level to examine the exit condition expression. return ComputeExitLimitFromCond(L, ExitBr->getCondition(), ExitBr->getSuccessor(0), - ExitBr->getSuccessor(1)); + ExitBr->getSuccessor(1), + /*IsSubExpr=*/false); } /// ComputeExitLimitFromCond - Compute the number of times the /// backedge of the specified loop will execute if its exit condition /// were a conditional branch of ExitCond, TBB, and FBB. +/// +/// @param IsSubExpr is true if ExitCond does not directly control the exit +/// branch. In this case, we cannot assume that the loop only exits when the +/// condition is true and cannot infer that failing to meet the condition prior +/// to integer wraparound results in undefined behavior. ScalarEvolution::ExitLimit ScalarEvolution::ComputeExitLimitFromCond(const Loop *L, Value *ExitCond, BasicBlock *TBB, - BasicBlock *FBB) { + BasicBlock *FBB, + bool IsSubExpr) { // Check if the controlling expression for this loop is an And or Or. if (BinaryOperator *BO = dyn_cast<BinaryOperator>(ExitCond)) { if (BO->getOpcode() == Instruction::And) { // Recurse on the operands of the and. 
- ExitLimit EL0 = ComputeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB); - ExitLimit EL1 = ComputeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB); + bool EitherMayExit = L->contains(TBB); + ExitLimit EL0 = ComputeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB, + IsSubExpr || EitherMayExit); + ExitLimit EL1 = ComputeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB, + IsSubExpr || EitherMayExit); const SCEV *BECount = getCouldNotCompute(); const SCEV *MaxBECount = getCouldNotCompute(); - if (L->contains(TBB)) { + if (EitherMayExit) { // Both conditions must be true for the loop to continue executing. // Choose the less conservative count. if (EL0.Exact == getCouldNotCompute() || @@ -4429,11 +4448,14 @@ ScalarEvolution::ComputeExitLimitFromCond(const Loop *L, } if (BO->getOpcode() == Instruction::Or) { // Recurse on the operands of the or. - ExitLimit EL0 = ComputeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB); - ExitLimit EL1 = ComputeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB); + bool EitherMayExit = L->contains(FBB); + ExitLimit EL0 = ComputeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB, + IsSubExpr || EitherMayExit); + ExitLimit EL1 = ComputeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB, + IsSubExpr || EitherMayExit); const SCEV *BECount = getCouldNotCompute(); const SCEV *MaxBECount = getCouldNotCompute(); - if (L->contains(FBB)) { + if (EitherMayExit) { // Both conditions must be false for the loop to continue executing. // Choose the less conservative count. if (EL0.Exact == getCouldNotCompute() || @@ -4464,7 +4486,7 @@ ScalarEvolution::ComputeExitLimitFromCond(const Loop *L, // With an icmp, it may be feasible to compute an exact backedge-taken count. // Proceed to the next level to examine the icmp. if (ICmpInst *ExitCondICmp = dyn_cast<ICmpInst>(ExitCond)) - return ComputeExitLimitFromICmp(L, ExitCondICmp, TBB, FBB); + return ComputeExitLimitFromICmp(L, ExitCondICmp, TBB, FBB, IsSubExpr); // Check for a constant condition. 
These are normally stripped out by // SimplifyCFG, but ScalarEvolution may be used by a pass which wishes to @@ -4490,7 +4512,8 @@ ScalarEvolution::ExitLimit ScalarEvolution::ComputeExitLimitFromICmp(const Loop *L, ICmpInst *ExitCond, BasicBlock *TBB, - BasicBlock *FBB) { + BasicBlock *FBB, + bool IsSubExpr) { // If the condition was exit on true, convert the condition to exit on false ICmpInst::Predicate Cond; @@ -4542,7 +4565,7 @@ ScalarEvolution::ComputeExitLimitFromICmp(const Loop *L, switch (Cond) { case ICmpInst::ICMP_NE: { // while (X != Y) // Convert to: while (X-Y != 0) - ExitLimit EL = HowFarToZero(getMinusSCEV(LHS, RHS), L); + ExitLimit EL = HowFarToZero(getMinusSCEV(LHS, RHS), L, IsSubExpr); if (EL.hasAnyInfo()) return EL; break; } @@ -4553,24 +4576,24 @@ ScalarEvolution::ComputeExitLimitFromICmp(const Loop *L, break; } case ICmpInst::ICMP_SLT: { - ExitLimit EL = HowManyLessThans(LHS, RHS, L, true); + ExitLimit EL = HowManyLessThans(LHS, RHS, L, true, IsSubExpr); if (EL.hasAnyInfo()) return EL; break; } case ICmpInst::ICMP_SGT: { ExitLimit EL = HowManyLessThans(getNotSCEV(LHS), - getNotSCEV(RHS), L, true); + getNotSCEV(RHS), L, true, IsSubExpr); if (EL.hasAnyInfo()) return EL; break; } case ICmpInst::ICMP_ULT: { - ExitLimit EL = HowManyLessThans(LHS, RHS, L, false); + ExitLimit EL = HowManyLessThans(LHS, RHS, L, false, IsSubExpr); if (EL.hasAnyInfo()) return EL; break; } case ICmpInst::ICMP_UGT: { ExitLimit EL = HowManyLessThans(getNotSCEV(LHS), - getNotSCEV(RHS), L, false); + getNotSCEV(RHS), L, false, IsSubExpr); if (EL.hasAnyInfo()) return EL; break; } @@ -5439,7 +5462,7 @@ SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) { /// effectively V != 0. We know and take advantage of the fact that this /// expression only being used in a comparison by zero context. ScalarEvolution::ExitLimit -ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L) { +ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L, bool IsSubExpr) { // If the value is a constant if (const SCEVConstant *C = dyn_cast<SCEVConstant>(V)) { // If the value is already zero, the branch will execute zero times. @@ -5537,19 +5560,20 @@ ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L) { } // If the recurrence is known not to wraparound, unsigned divide computes the - // back edge count. We know that the value will either become zero (and thus - // the loop terminates), that the loop will terminate through some other exit - // condition first, or that the loop has undefined behavior. This means - // we can't "miss" the exit value, even with nonunit stride. + // back edge count. (Ideally we would have an "isexact" bit for udiv). We know + // that the value will either become zero (and thus the loop terminates), that + // the loop will terminate through some other exit condition first, or that + // the loop has undefined behavior. This means we can't "miss" the exit + // value, even with nonunit stride. // - // FIXME: Prove that loops always exhibits *acceptable* undefined - // behavior. Loops must exhibit defined behavior until a wrapped value is - // actually used. So the trip count computed by udiv could be smaller than the - // number of well-defined iterations. - if (AddRec->getNoWrapFlags(SCEV::FlagNW)) { - // FIXME: We really want an "isexact" bit for udiv. + // This is only valid for expressions that directly compute the loop exit. It + // is invalid for subexpressions in which the loop may exit through this + // branch even if this subexpression is false. 
In that case, the trip count + // computed by this udiv could be smaller than the number of well-defined + // iterations. + if (!IsSubExpr && AddRec->getNoWrapFlags(SCEV::FlagNW)) return getUDivExpr(Distance, CountDown ? getNegativeSCEV(Step) : Step); - } + // Then, try to solve the above equation provided that Start is constant. if (const SCEVConstant *StartC = dyn_cast<SCEVConstant>(Start)) return SolveLinEquationWithOverflow(StepC->getValue()->getValue(), @@ -6315,9 +6339,14 @@ const SCEV *ScalarEvolution::getBECount(const SCEV *Start, /// HowManyLessThans - Return the number of times a backedge containing the /// specified less-than comparison will execute. If not computable, return /// CouldNotCompute. +/// +/// @param IsSubExpr is true when the LHS < RHS condition does not directly +/// control the branch. In this case, we can only compute an iteration count for +/// a subexpression that cannot overflow before evaluating true. ScalarEvolution::ExitLimit ScalarEvolution::HowManyLessThans(const SCEV *LHS, const SCEV *RHS, - const Loop *L, bool isSigned) { + const Loop *L, bool isSigned, + bool IsSubExpr) { // Only handle: "ADDREC < LoopInvariant". if (!isLoopInvariant(RHS, L)) return getCouldNotCompute(); @@ -6326,10 +6355,12 @@ ScalarEvolution::HowManyLessThans(const SCEV *LHS, const SCEV *RHS, return getCouldNotCompute(); // Check to see if we have a flag which makes analysis easy. - bool NoWrap = isSigned ? - AddRec->getNoWrapFlags((SCEV::NoWrapFlags)(SCEV::FlagNSW | SCEV::FlagNW)) : - AddRec->getNoWrapFlags((SCEV::NoWrapFlags)(SCEV::FlagNUW | SCEV::FlagNW)); - + bool NoWrap = false; + if (!IsSubExpr) { + NoWrap = AddRec->getNoWrapFlags( + (SCEV::NoWrapFlags)(((isSigned ? SCEV::FlagNSW : SCEV::FlagNUW)) + | SCEV::FlagNW)); + } if (AddRec->isAffine()) { unsigned BitWidth = getTypeSizeInBits(AddRec->getType()); const SCEV *Step = AddRec->getStepRecurrence(*this); diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index aeaa63f2af..73bba6989f 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -708,6 +708,12 @@ CompileUnit *DwarfDebug::constructCompileUnit(const MDNode *N) { Asm->OutStreamer.getContext().setMCLineTableSymbol(LineTableStartSym, NewCU->getUniqueID()); + // Use a single line table if we are using .loc and generating assembly. + bool UseTheFirstCU = + (Asm->TM.hasMCUseLoc() && + Asm->OutStreamer.getKind() == MCStreamer::SK_AsmStreamer) || + (NewCU->getUniqueID() == 0); + // DW_AT_stmt_list is a offset of line number information for this // compile unit in debug_line section. For split dwarf this is // left in the skeleton CU and so not included. @@ -716,9 +722,9 @@ CompileUnit *DwarfDebug::constructCompileUnit(const MDNode *N) { if (!useSplitDwarf()) { if (Asm->MAI->doesDwarfUseRelocationsAcrossSections()) NewCU->addLabel(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, - NewCU->getUniqueID() == 0 ? + UseTheFirstCU ? 
Asm->GetTempSymbol("section_line") : LineTableStartSym); - else if (NewCU->getUniqueID() == 0) + else if (UseTheFirstCU) NewCU->addUInt(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, 0); else NewCU->addDelta(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, @@ -1441,7 +1447,12 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { LexicalScope *FnScope = LScopes.getCurrentFunctionScope(); CompileUnit *TheCU = SPMap.lookup(FnScope->getScopeNode()); assert(TheCU && "Unable to find compile unit!"); - Asm->OutStreamer.getContext().setDwarfCompileUnitID(TheCU->getUniqueID()); + if (Asm->TM.hasMCUseLoc() && + Asm->OutStreamer.getKind() == MCStreamer::SK_AsmStreamer) + // Use a single line table if we are using .loc and generating assembly. + Asm->OutStreamer.getContext().setDwarfCompileUnitID(0); + else + Asm->OutStreamer.getContext().setDwarfCompileUnitID(TheCU->getUniqueID()); FunctionBeginSym = Asm->GetTempSymbol("func_begin", Asm->getFunctionNumber()); diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp index 07f0ccf52f..d894f664dc 100644 --- a/lib/CodeGen/IntrinsicLowering.cpp +++ b/lib/CodeGen/IntrinsicLowering.cpp @@ -453,6 +453,12 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) { CI->replaceAllUsesWith(ConstantInt::get(CI->getType(), 1)); break; + case Intrinsic::annotation: + case Intrinsic::ptr_annotation: + // Just drop the annotation, but forward the value + CI->replaceAllUsesWith(CI->getOperand(0)); + break; + case Intrinsic::var_annotation: break; // Strip out annotate intrinsic diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 2ded723ca0..67db211ec4 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -5034,6 +5034,11 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { setValue(&I, Res); return 0; } + case Intrinsic::annotation: + case Intrinsic::ptr_annotation: + // Drop the intrinsic, but forward the value + setValue(&I, getValue(I.getOperand(0))); + return 0; case Intrinsic::var_annotation: // Discard annotate attributes return 0; diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index 2368e9e86a..edefdb4c36 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -1092,7 +1092,7 @@ bool AsmParser::ParseBinOpRHS(unsigned Precedence, const MCExpr *&Res, MCBinaryExpr::Opcode Dummy; unsigned NextTokPrec = getBinOpPrecedence(Lexer.getKind(), Dummy); if (TokPrec < NextTokPrec) { - if (ParseBinOpRHS(Precedence+1, RHS, EndLoc)) return true; + if (ParseBinOpRHS(TokPrec+1, RHS, EndLoc)) return true; } // Merge LHS and RHS according to operator. 
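The one-character AsmParser change above (recursing with ``TokPrec+1`` rather than ``Precedence+1``) matters for operator associativity in precedence climbing. The following is a self-contained simplified model of the algorithm, not the AsmParser code itself, showing why the recursive lower bound must come from the current operator's precedence:

```cpp
#include <cassert>
#include <cctype>
#include <string>

// Tiny precedence-climbing evaluator over '+', '-', '*', '/' and integers.
struct Parser {
  std::string S;
  size_t I;
  explicit Parser(const std::string &Src) : S(Src), I(0) {}

  int prec() const {
    if (I >= S.size()) return -1;
    char C = S[I];
    if (C == '+' || C == '-') return 1;
    if (C == '*' || C == '/') return 2;
    return -1;
  }
  long primary() {
    long V = 0;
    while (I < S.size() && isdigit(static_cast<unsigned char>(S[I])))
      V = V * 10 + (S[I++] - '0');
    return V;
  }
  long binOpRHS(int MinPrec, long LHS) {
    for (;;) {
      int TokPrec = prec();
      if (TokPrec < MinPrec) return LHS; // let the caller bind what follows
      char Op = S[I++];
      long RHS = primary();
      // If the next operator binds tighter than this one, it owns RHS first.
      // The recursive bound must be TokPrec + 1: the pre-fix bound of
      // MinPrec + 1 also lets the recursion swallow later operators of the
      // *current* precedence level, breaking left-associativity.
      if (TokPrec < prec())
        RHS = binOpRHS(TokPrec + 1, RHS);
      switch (Op) {
      case '+': LHS += RHS; break;
      case '-': LHS -= RHS; break;
      case '*': LHS *= RHS; break;
      case '/': LHS /= RHS; break;
      }
    }
  }
  long eval() { return binOpRHS(0, primary()); }
};

int main() {
  // Correct left-associative parse: ((1 - 2*3) - 4) == -9. Recursing with
  // MinPrec + 1 instead would group it as 1 - (2*3 - 4) == -1.
  assert(Parser("1-2*3-4").eval() == -9);
  return 0;
}
```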
diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp index 4839c3470c..8f1895e048 100644 --- a/lib/MC/MCStreamer.cpp +++ b/lib/MC/MCStreamer.cpp @@ -35,6 +35,7 @@ MCStreamer::~MCStreamer() { void MCStreamer::reset() { for (unsigned i = 0; i < getNumW64UnwindInfos(); ++i) delete W64UnwindInfos[i]; + W64UnwindInfos.clear(); EmitEHFrame = true; EmitDebugFrame = false; CurrentW64UnwindInfo = 0; diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 0f7beb1e3b..e49cfc4985 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -5257,6 +5257,23 @@ static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { return false; } +static EVT getExtensionTo64Bits(const EVT &OrigVT) { + if (OrigVT.getSizeInBits() >= 64) + return OrigVT; + + assert(OrigVT.isSimple() && "Expecting a simple value type"); + + MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; + switch (OrigSimpleTy) { + default: llvm_unreachable("Unexpected Vector Type"); + case MVT::v2i8: + case MVT::v2i16: + return MVT::v2i32; + case MVT::v4i8: + return MVT::v4i16; + } +} + /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL. /// We insert the required extension here to get the vector to fill a D register. @@ -5272,18 +5289,8 @@ static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, return N; // Must extend size to at least 64 bits to be used as an operand for VMULL. - MVT::SimpleValueType OrigSimpleTy = OrigTy.getSimpleVT().SimpleTy; - EVT NewVT; - switch (OrigSimpleTy) { - default: llvm_unreachable("Unexpected Orig Vector Type"); - case MVT::v2i8: - case MVT::v2i16: - NewVT = MVT::v2i32; - break; - case MVT::v4i8: - NewVT = MVT::v4i16; - break; - } + EVT NewVT = getExtensionTo64Bits(OrigTy); + return DAG.getNode(ExtOpcode, N->getDebugLoc(), NewVT, N); } @@ -5293,22 +5300,22 @@ static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, /// reach a total size of 64 bits. We have to add the extension separately /// because ARM does not have a sign/zero extending load for vectors. static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) { - SDValue NonExtendingLoad = - DAG.getLoad(LD->getMemoryVT(), LD->getDebugLoc(), LD->getChain(), + EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT()); + + // The load already has the right type. + if (ExtendedTy == LD->getMemoryVT()) + return DAG.getLoad(LD->getMemoryVT(), LD->getDebugLoc(), LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(), LD->isNonTemporal(), LD->isInvariant(), LD->getAlignment()); - unsigned ExtOp = 0; - switch (LD->getExtensionType()) { - default: llvm_unreachable("Unexpected LoadExtType"); - case ISD::EXTLOAD: - case ISD::SEXTLOAD: ExtOp = ISD::SIGN_EXTEND; break; - case ISD::ZEXTLOAD: ExtOp = ISD::ZERO_EXTEND; break; - } - MVT::SimpleValueType MemType = LD->getMemoryVT().getSimpleVT().SimpleTy; - MVT::SimpleValueType ExtType = LD->getValueType(0).getSimpleVT().SimpleTy; - return AddRequiredExtensionForVMULL(NonExtendingLoad, DAG, - MemType, ExtType, ExtOp); + + // We need to create a zextload/sextload. We cannot just create a load + // followed by a zext/zext node because LowerMUL is also run during normal + // operation legalization where we can't create illegal types. 
+ return DAG.getExtLoad(LD->getExtensionType(), LD->getDebugLoc(), ExtendedTy, + LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), + LD->getMemoryVT(), LD->isVolatile(), + LD->isNonTemporal(), LD->getAlignment()); } /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, diff --git a/lib/Target/NVPTX/CMakeLists.txt b/lib/Target/NVPTX/CMakeLists.txt index 7da2fed4cd..735ca9b6b7 100644 --- a/lib/Target/NVPTX/CMakeLists.txt +++ b/lib/Target/NVPTX/CMakeLists.txt @@ -23,6 +23,7 @@ set(NVPTXCodeGen_sources NVPTXAsmPrinter.cpp NVPTXUtilities.cpp NVVMReflect.cpp + NVPTXGenericToNVVM.cpp ) add_llvm_target(NVPTXCodeGen ${NVPTXCodeGen_sources}) diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h index 6a53a443bf..072c65da35 100644 --- a/lib/Target/NVPTX/NVPTX.h +++ b/lib/Target/NVPTX/NVPTX.h @@ -16,6 +16,7 @@ #define LLVM_TARGET_NVPTX_H #include "MCTargetDesc/NVPTXBaseInfo.h" +#include "llvm/ADT/StringMap.h" #include "llvm/IR/Module.h" #include "llvm/IR/Value.h" #include "llvm/Support/ErrorHandling.h" @@ -62,6 +63,9 @@ createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOpt::Level OptLevel); FunctionPass *createLowerStructArgsPass(NVPTXTargetMachine &); FunctionPass *createNVPTXReMatPass(NVPTXTargetMachine &); FunctionPass *createNVPTXReMatBlockPass(NVPTXTargetMachine &); +ModulePass *createGenericToNVVMPass(); +ModulePass *createNVVMReflectPass(); +ModulePass *createNVVMReflectPass(const StringMap<int>& Mapping); bool isImageOrSamplerVal(const Value *, const Module *); diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index ce5d78afa3..229e4e5980 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -68,11 +68,12 @@ InterleaveSrc("nvptx-emit-src", cl::ZeroOrMore, namespace { /// DiscoverDependentGlobals - Return a set of GlobalVariables on which \p V /// depends. -void DiscoverDependentGlobals(Value *V, DenseSet<GlobalVariable *> &Globals) { - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) +void DiscoverDependentGlobals(const Value *V, + DenseSet<const GlobalVariable *> &Globals) { + if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) Globals.insert(GV); else { - if (User *U = dyn_cast<User>(V)) { + if (const User *U = dyn_cast<User>(V)) { for (unsigned i = 0, e = U->getNumOperands(); i != e; ++i) { DiscoverDependentGlobals(U->getOperand(i), Globals); } @@ -84,8 +85,9 @@ void DiscoverDependentGlobals(Value *V, DenseSet<GlobalVariable *> &Globals) { /// instances to be emitted, but only after any dependents have been added /// first. void VisitGlobalVariableForEmission( - GlobalVariable *GV, SmallVectorImpl<GlobalVariable *> &Order, - DenseSet<GlobalVariable *> &Visited, DenseSet<GlobalVariable *> &Visiting) { + const GlobalVariable *GV, SmallVectorImpl<const GlobalVariable *> &Order, + DenseSet<const GlobalVariable *> &Visited, + DenseSet<const GlobalVariable *> &Visiting) { // Have we already visited this one? 
if (Visited.count(GV)) return; @@ -98,12 +100,12 @@ void VisitGlobalVariableForEmission( Visiting.insert(GV); // Make sure we visit all dependents first - DenseSet<GlobalVariable *> Others; + DenseSet<const GlobalVariable *> Others; for (unsigned i = 0, e = GV->getNumOperands(); i != e; ++i) DiscoverDependentGlobals(GV->getOperand(i), Others); - for (DenseSet<GlobalVariable *>::iterator I = Others.begin(), - E = Others.end(); + for (DenseSet<const GlobalVariable *>::iterator I = Others.begin(), + E = Others.end(); I != E; ++I) VisitGlobalVariableForEmission(*I, Order, Visited, Visiting); @@ -405,6 +407,11 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() { SmallString<128> Str; raw_svector_ostream O(Str); + if (!GlobalsEmitted) { + emitGlobals(*MF->getFunction()->getParent()); + GlobalsEmitted = true; + } + // Set up MRI = &MF->getRegInfo(); F = MF->getFunction(); @@ -695,7 +702,7 @@ void NVPTXAsmPrinter::emitDeclaration(const Function *F, raw_ostream &O) { else O << ".func "; printReturnValStr(F, O); - O << *CurrentFnSym << "\n"; + O << *Mang->getSymbol(F) << "\n"; emitFunctionParamList(F, O); O << ";\n"; } @@ -795,7 +802,7 @@ static bool useFuncSeen(const Constant *C, return false; } -void NVPTXAsmPrinter::emitDeclarations(Module &M, raw_ostream &O) { +void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) { llvm::DenseMap<const Function *, bool> seenMap; for (Module::const_iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) { const Function *F = FI; @@ -805,7 +812,6 @@ void NVPTXAsmPrinter::emitDeclarations(Module &M, raw_ostream &O) { continue; if (F->getIntrinsicID()) continue; - CurrentFnSym = Mang->getSymbol(F); emitDeclaration(F, O); continue; } @@ -817,14 +823,12 @@ void NVPTXAsmPrinter::emitDeclarations(Module &M, raw_ostream &O) { // The use is in the initialization of a global variable // that is a function pointer, so print a declaration // for the original function - CurrentFnSym = Mang->getSymbol(F); emitDeclaration(F, O); break; } // Emit a declaration of this function if the function that // uses this constant expr has already been seen. if (useFuncSeen(C, seenMap)) { - CurrentFnSym = Mang->getSymbol(F); emitDeclaration(F, O); break; } @@ -844,7 +848,6 @@ void NVPTXAsmPrinter::emitDeclarations(Module &M, raw_ostream &O) { // appearing in the module before the callee. so print out // a declaration for the callee. if (seenMap.find(caller) != seenMap.end()) { - CurrentFnSym = Mang->getSymbol(F); emitDeclaration(F, O); break; } @@ -921,6 +924,12 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) { if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA) recordAndEmitFilenames(M); + GlobalsEmitted = false; + + return false; // success +} + +void NVPTXAsmPrinter::emitGlobals(const Module &M) { SmallString<128> Str2; raw_svector_ostream OS2(Str2); @@ -931,13 +940,13 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) { // global variable in order, and ensure that we emit it *after* its dependent // globals. We use a little extra memory maintaining both a set and a list to // have fast searches while maintaining a strict ordering. 
- SmallVector<GlobalVariable *, 8> Globals; - DenseSet<GlobalVariable *> GVVisited; - DenseSet<GlobalVariable *> GVVisiting; + SmallVector<const GlobalVariable *, 8> Globals; + DenseSet<const GlobalVariable *> GVVisited; + DenseSet<const GlobalVariable *> GVVisiting; // Visit each global variable, in order - for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E; - ++I) + for (Module::const_global_iterator I = M.global_begin(), E = M.global_end(); + I != E; ++I) VisitGlobalVariableForEmission(I, Globals, GVVisited, GVVisiting); assert(GVVisited.size() == M.getGlobalList().size() && @@ -951,7 +960,6 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) { OS2 << '\n'; OutStreamer.EmitRawText(OS2.str()); - return false; // success } void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O) { @@ -989,6 +997,14 @@ void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O) { } bool NVPTXAsmPrinter::doFinalization(Module &M) { + + // If we did not emit any functions, then the global declarations have not + // yet been emitted. + if (!GlobalsEmitted) { + emitGlobals(M); + GlobalsEmitted = true; + } + // XXX Temproarily remove global variables so that doFinalization() will not // emit them again (global variables are emitted at beginning). @@ -1063,7 +1079,8 @@ void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V, } } -void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O, +void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, + raw_ostream &O, bool processDemoted) { // Skip meta data @@ -1107,10 +1124,10 @@ void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O, if (llvm::isSampler(*GVar)) { O << ".global .samplerref " << llvm::getSamplerName(*GVar); - Constant *Initializer = NULL; + const Constant *Initializer = NULL; if (GVar->hasInitializer()) Initializer = GVar->getInitializer(); - ConstantInt *CI = NULL; + const ConstantInt *CI = NULL; if (Initializer) CI = dyn_cast<ConstantInt>(Initializer); if (CI) { @@ -1183,7 +1200,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O, if (localDecls.find(demotedFunc) != localDecls.end()) localDecls[demotedFunc].push_back(GVar); else { - std::vector<GlobalVariable *> temp; + std::vector<const GlobalVariable *> temp; temp.push_back(GVar); localDecls[demotedFunc] = temp; } @@ -1199,7 +1216,11 @@ void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O, if (ETy->isPrimitiveType() || ETy->isIntegerTy() || isa<PointerType>(ETy)) { O << " ."; - O << getPTXFundamentalTypeStr(ETy, false); + // Special case: ABI requires that we use .u8 for predicates + if (ETy->isIntegerTy(1)) + O << "u8"; + else + O << getPTXFundamentalTypeStr(ETy, false); O << " "; O << *Mang->getSymbol(GVar); @@ -1209,7 +1230,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O, (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST_NOT_GEN) || (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) && GVar->hasInitializer()) { - Constant *Initializer = GVar->getInitializer(); + const Constant *Initializer = GVar->getInitializer(); if (!Initializer->isNullValue()) { O << " = "; printScalarConstant(Initializer, O); @@ -1233,7 +1254,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O, (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST_NOT_GEN) || (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) && GVar->hasInitializer()) { - Constant *Initializer = GVar->getInitializer(); + const 
Constant *Initializer = GVar->getInitializer(); if (!isa<UndefValue>(Initializer) && !Initializer->isNullValue()) { AggBuffer aggBuffer(ElementSize, O, *this); bufferAggregateConstant(Initializer, &aggBuffer); @@ -1283,7 +1304,7 @@ void NVPTXAsmPrinter::emitDemotedVars(const Function *f, raw_ostream &O) { if (localDecls.find(f) == localDecls.end()) return; - std::vector<GlobalVariable *> &gvars = localDecls[f]; + std::vector<const GlobalVariable *> &gvars = localDecls[f]; for (unsigned i = 0, e = gvars.size(); i != e; ++i) { O << "\t// demoted variable\n\t"; @@ -1448,7 +1469,7 @@ void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I, int paramIndex, raw_ostream &O) { if ((nvptxSubtarget.getDrvInterface() == NVPTX::NVCL) || (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA)) - O << *CurrentFnSym << "_param_" << paramIndex; + O << *Mang->getSymbol(I->getParent()) << "_param_" << paramIndex; else { std::string argName = I->getName(); const char *p = argName.c_str(); @@ -1507,11 +1528,13 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { if (llvm::isImage(*I)) { std::string sname = I->getName(); if (llvm::isImageWriteOnly(*I)) - O << "\t.param .surfref " << *CurrentFnSym << "_param_" << paramIndex; + O << "\t.param .surfref " << *Mang->getSymbol(F) << "_param_" + << paramIndex; else // Default image is read_only - O << "\t.param .texref " << *CurrentFnSym << "_param_" << paramIndex; + O << "\t.param .texref " << *Mang->getSymbol(F) << "_param_" + << paramIndex; } else // Should be llvm::isSampler(*I) - O << "\t.param .samplerref " << *CurrentFnSym << "_param_" + O << "\t.param .samplerref " << *Mang->getSymbol(F) << "_param_" << paramIndex; continue; } @@ -1564,7 +1587,13 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { } // non-pointer scalar to kernel func - O << "\t.param ." 
<< getPTXFundamentalTypeStr(Ty) << " "; + O << "\t.param ."; + // Special case: predicate operands become .u8 types + if (Ty->isIntegerTy(1)) + O << "u8"; + else + O << getPTXFundamentalTypeStr(Ty); + O << " "; printParamName(I, paramIndex, O); continue; } @@ -1751,12 +1780,12 @@ void NVPTXAsmPrinter::printFPConstant(const ConstantFP *Fp, raw_ostream &O) { O << utohexstr(API.getZExtValue()); } -void NVPTXAsmPrinter::printScalarConstant(Constant *CPV, raw_ostream &O) { - if (ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) { +void NVPTXAsmPrinter::printScalarConstant(const Constant *CPV, raw_ostream &O) { + if (const ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) { O << CI->getValue(); return; } - if (ConstantFP *CFP = dyn_cast<ConstantFP>(CPV)) { + if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CPV)) { printFPConstant(CFP, O); return; } @@ -1764,13 +1793,13 @@ void NVPTXAsmPrinter::printScalarConstant(Constant *CPV, raw_ostream &O) { O << "0"; return; } - if (GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) { + if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) { O << *Mang->getSymbol(GVar); return; } - if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { - Value *v = Cexpr->stripPointerCasts(); - if (GlobalValue *GVar = dyn_cast<GlobalValue>(v)) { + if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { + const Value *v = Cexpr->stripPointerCasts(); + if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) { O << *Mang->getSymbol(GVar); return; } else { @@ -1781,7 +1810,7 @@ void NVPTXAsmPrinter::printScalarConstant(Constant *CPV, raw_ostream &O) { llvm_unreachable("Not scalar type found in printScalarConstant()"); } -void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes, +void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, AggBuffer *aggBuffer) { const DataLayout *TD = TM.getDataLayout(); @@ -1809,13 +1838,13 @@ void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes, ptr = (unsigned char *)&int16; aggBuffer->addBytes(ptr, 2, Bytes); } else if (ETy == Type::getInt32Ty(CPV->getContext())) { - if (ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) { + if (const ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) { int int32 = (int)(constInt->getZExtValue()); ptr = (unsigned char *)&int32; aggBuffer->addBytes(ptr, 4, Bytes); break; - } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { - if (ConstantInt *constInt = dyn_cast<ConstantInt>( + } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { + if (const ConstantInt *constInt = dyn_cast<ConstantInt>( ConstantFoldConstantExpression(Cexpr, TD))) { int int32 = (int)(constInt->getZExtValue()); ptr = (unsigned char *)&int32; @@ -1831,13 +1860,13 @@ void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes, } llvm_unreachable("unsupported integer const type"); } else if (ETy == Type::getInt64Ty(CPV->getContext())) { - if (ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) { + if (const ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) { long long int64 = (long long)(constInt->getZExtValue()); ptr = (unsigned char *)&int64; aggBuffer->addBytes(ptr, 8, Bytes); break; - } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { - if (ConstantInt *constInt = dyn_cast<ConstantInt>( + } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { + if (const ConstantInt *constInt = dyn_cast<ConstantInt>( ConstantFoldConstantExpression(Cexpr, TD))) { long long int64 = (long long)(constInt->getZExtValue()); ptr = (unsigned char *)&int64; @@ -1858,7 +1887,7 @@ void 
NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes, } case Type::FloatTyID: case Type::DoubleTyID: { - ConstantFP *CFP = dyn_cast<ConstantFP>(CPV); + const ConstantFP *CFP = dyn_cast<ConstantFP>(CPV); const Type *Ty = CFP->getType(); if (Ty == Type::getFloatTy(CPV->getContext())) { float float32 = (float) CFP->getValueAPF().convertToFloat(); @@ -1874,10 +1903,10 @@ void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes, break; } case Type::PointerTyID: { - if (GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) { + if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) { aggBuffer->addSymbol(GVar); - } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { - Value *v = Cexpr->stripPointerCasts(); + } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { + const Value *v = Cexpr->stripPointerCasts(); aggBuffer->addSymbol(v); } unsigned int s = TD->getTypeAllocSize(CPV->getType()); @@ -1906,7 +1935,7 @@ void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes, } } -void NVPTXAsmPrinter::bufferAggregateConstant(Constant *CPV, +void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV, AggBuffer *aggBuffer) { const DataLayout *TD = TM.getDataLayout(); int Bytes; diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h index 6dc9fc0ffe..7faa6b265b 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -91,7 +91,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { unsigned char *buffer; // the buffer unsigned numSymbols; // number of symbol addresses SmallVector<unsigned, 4> symbolPosInBuffer; - SmallVector<Value *, 4> Symbols; + SmallVector<const Value *, 4> Symbols; private: unsigned curpos; @@ -128,7 +128,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { } return curpos; } - void addSymbol(Value *GVar) { + void addSymbol(const Value *GVar) { symbolPosInBuffer.push_back(curpos); Symbols.push_back(GVar); numSymbols++; @@ -153,11 +153,11 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { if (pos) O << ", "; if (pos == nextSymbolPos) { - Value *v = Symbols[nSym]; - if (GlobalValue *GVar = dyn_cast<GlobalValue>(v)) { + const Value *v = Symbols[nSym]; + if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) { MCSymbol *Name = AP.Mang->getSymbol(GVar); O << *Name; - } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(v)) { + } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(v)) { O << *nvptx::LowerConstant(Cexpr, AP); } else llvm_unreachable("symbol type unknown"); @@ -205,10 +205,12 @@ private: void printImplicitDef(const MachineInstr *MI, raw_ostream &O) const; // definition autogenerated. void printInstruction(const MachineInstr *MI, raw_ostream &O); - void printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O, bool = false); + void printModuleLevelGV(const GlobalVariable *GVar, raw_ostream &O, + bool = false); void printParamName(int paramIndex, raw_ostream &O); void printParamName(Function::const_arg_iterator I, int paramIndex, raw_ostream &O); + void emitGlobals(const Module &M); void emitHeader(Module &M, raw_ostream &O); void emitKernelFunctionDirectives(const Function &F, raw_ostream &O) const; void emitVirtualRegister(unsigned int vr, bool isVec, raw_ostream &O); @@ -234,6 +236,8 @@ protected: private: std::string CurrentBankselLabelInBasicBlock; + bool GlobalsEmitted; + // This is specific per MachineFunction. 
const MachineRegisterInfo *MRI; // The contents are specific for each @@ -247,7 +251,7 @@ private: std::map<const Type *, std::string> TypeNameMap; // List of variables demoted to a function scope. - std::map<const Function *, std::vector<GlobalVariable *> > localDecls; + std::map<const Function *, std::vector<const GlobalVariable *> > localDecls; // To record filename to ID mapping std::map<std::string, unsigned> filenameMap; @@ -256,15 +260,15 @@ private: void emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O); void emitPTXAddressSpace(unsigned int AddressSpace, raw_ostream &O) const; std::string getPTXFundamentalTypeStr(const Type *Ty, bool = true) const; - void printScalarConstant(Constant *CPV, raw_ostream &O); + void printScalarConstant(const Constant *CPV, raw_ostream &O); void printFPConstant(const ConstantFP *Fp, raw_ostream &O); - void bufferLEByte(Constant *CPV, int Bytes, AggBuffer *aggBuffer); - void bufferAggregateConstant(Constant *CV, AggBuffer *aggBuffer); + void bufferLEByte(const Constant *CPV, int Bytes, AggBuffer *aggBuffer); + void bufferAggregateConstant(const Constant *CV, AggBuffer *aggBuffer); void printOperandProper(const MachineOperand &MO); void emitLinkageDirective(const GlobalValue *V, raw_ostream &O); - void emitDeclarations(Module &, raw_ostream &O); + void emitDeclarations(const Module &, raw_ostream &O); void emitDeclaration(const Function *, raw_ostream &O); static const char *getRegisterName(unsigned RegNo); diff --git a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp new file mode 100644 index 0000000000..1077c46fb4 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp @@ -0,0 +1,436 @@ +//===-- GenericToNVVM.cpp - Convert generic module to NVVM module - C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Convert generic global variables into either .global or .const access based +// on the variable's "constant" qualifier. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXUtilities.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
+
+#include "llvm/PassManager.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/ADT/ValueMap.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/IRBuilder.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeGenericToNVVMPass(PassRegistry &);
+}
+
+namespace {
+class GenericToNVVM : public ModulePass {
+public:
+  static char ID;
+
+  GenericToNVVM() : ModulePass(ID) {}
+
+  virtual bool runOnModule(Module &M);
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+  }
+
+private:
+  Value *getOrInsertCVTA(Module *M, Function *F, GlobalVariable *GV,
+                         IRBuilder<> &Builder);
+  Value *remapConstant(Module *M, Function *F, Constant *C,
+                       IRBuilder<> &Builder);
+  Value *remapConstantVectorOrConstantAggregate(Module *M, Function *F,
+                                                Constant *C,
+                                                IRBuilder<> &Builder);
+  Value *remapConstantExpr(Module *M, Function *F, ConstantExpr *C,
+                           IRBuilder<> &Builder);
+  void remapNamedMDNode(Module *M, NamedMDNode *N);
+  MDNode *remapMDNode(Module *M, MDNode *N);
+
+  typedef ValueMap<GlobalVariable *, GlobalVariable *> GVMapTy;
+  typedef ValueMap<Constant *, Value *> ConstantToValueMapTy;
+  GVMapTy GVMap;
+  ConstantToValueMapTy ConstantToValueMap;
+};
+}
+
+char GenericToNVVM::ID = 0;
+
+ModulePass *llvm::createGenericToNVVMPass() { return new GenericToNVVM(); }
+
+INITIALIZE_PASS(
+    GenericToNVVM, "generic-to-nvvm",
+    "Ensure that the global variables are in the global address space", false,
+    false)
+
+bool GenericToNVVM::runOnModule(Module &M) {
+  // Create a clone of each global variable that has the default address space.
+  // The clone is created with the global address space specifier, and the pair
+  // of original global variable and its clone is placed in the GVMap for later
+  // use.
+
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+       I != E;) {
+    GlobalVariable *GV = I++;
+    if (GV->getType()->getAddressSpace() == llvm::ADDRESS_SPACE_GENERIC &&
+        !llvm::isTexture(*GV) && !llvm::isSurface(*GV) &&
+        !GV->getName().startswith("llvm.")) {
+      GlobalVariable *NewGV = new GlobalVariable(
+          M, GV->getType()->getElementType(), GV->isConstant(),
+          GV->getLinkage(), GV->hasInitializer() ? GV->getInitializer() : NULL,
+          "", GV, GV->getThreadLocalMode(), llvm::ADDRESS_SPACE_GLOBAL);
+      NewGV->copyAttributesFrom(GV);
+      GVMap[GV] = NewGV;
+    }
+  }
+
+  // Return immediately if every global variable has a specific address space
+  // specifier.
+  if (GVMap.empty()) {
+    return false;
+  }
+
+  // Walk through the instructions in function definitions, and replace any use
+  // of original global variables in GVMap with a use of the corresponding
+  // copies in GVMap. If necessary, promote constants to instructions.
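("Promote constants to instructions" here means re-emitting a ConstantExpr as ordinary instructions at its use site, since a constant that mentions a soon-to-be-erased global cannot be edited in place. A minimal sketch of the general mechanism, assuming ConstantExpr::getAsInstruction() from this era's tree; the pass's own remapConstantExpr below rebuilds operands more selectively:)

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Re-emit a constant expression as a real instruction so its operands
    // can afterwards be swapped for values computed at run time.
    static Value *promoteToInstruction(ConstantExpr *CE, IRBuilder<> &B) {
      return B.Insert(CE->getAsInstruction());
    }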
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+    if (I->isDeclaration()) {
+      continue;
+    }
+    IRBuilder<> Builder(I->getEntryBlock().getFirstNonPHIOrDbg());
+    for (Function::iterator BBI = I->begin(), BBE = I->end(); BBI != BBE;
+         ++BBI) {
+      for (BasicBlock::iterator II = BBI->begin(), IE = BBI->end(); II != IE;
+           ++II) {
+        for (unsigned i = 0, e = II->getNumOperands(); i < e; ++i) {
+          Value *Operand = II->getOperand(i);
+          if (isa<Constant>(Operand)) {
+            II->setOperand(
+                i, remapConstant(&M, I, cast<Constant>(Operand), Builder));
+          }
+        }
+      }
+    }
+    ConstantToValueMap.clear();
+  }
+
+  // Walk through the metadata section and update the debug information
+  // associated with the global variables in the default address space.
+  for (Module::named_metadata_iterator I = M.named_metadata_begin(),
+                                       E = M.named_metadata_end();
+       I != E; I++) {
+    remapNamedMDNode(&M, I);
+  }
+
+  // Walk through the global variable initializers, and replace any use of
+  // original global variables in GVMap with a use of the corresponding copies
+  // in GVMap. The copies need to be bitcast to the original global variable
+  // types, as we cannot use cvta in global variable initializers.
+  for (GVMapTy::iterator I = GVMap.begin(), E = GVMap.end(); I != E;) {
+    GlobalVariable *GV = I->first;
+    GlobalVariable *NewGV = I->second;
+    ++I;
+    Constant *BitCastNewGV = ConstantExpr::getBitCast(NewGV, GV->getType());
+    // At this point, the remaining uses of GV should be found only in global
+    // variable initializers, as other uses have already been removed
+    // while walking through the instructions in function definitions.
+    for (Value::use_iterator UI = GV->use_begin(), UE = GV->use_end();
+         UI != UE;) {
+      Use &U = (UI++).getUse();
+      U.set(BitCastNewGV);
+    }
+    std::string Name = GV->getName();
+    GV->removeDeadConstantUsers();
+    GV->eraseFromParent();
+    NewGV->setName(Name);
+  }
+  GVMap.clear();
+
+  return true;
+}
+
+Value *GenericToNVVM::getOrInsertCVTA(Module *M, Function *F,
+                                      GlobalVariable *GV,
+                                      IRBuilder<> &Builder) {
+  PointerType *GVType = GV->getType();
+  Value *CVTA = NULL;
+
+  // See if the address space conversion requires the operand to be bitcast
+  // to i8 addrspace(n)* first.
+  EVT ExtendedGVType = EVT::getEVT(GVType->getElementType(), true);
+  if (!ExtendedGVType.isInteger() && !ExtendedGVType.isFloatingPoint()) {
+    // A bitcast to i8 addrspace(n)* on the operand is needed.
+    LLVMContext &Context = M->getContext();
+    unsigned int AddrSpace = GVType->getAddressSpace();
+    Type *DestTy = PointerType::get(Type::getInt8Ty(Context), AddrSpace);
+    CVTA = Builder.CreateBitCast(GV, DestTy, "cvta");
+    // Insert the address space conversion.
+    Type *ResultType =
+        PointerType::get(Type::getInt8Ty(Context), llvm::ADDRESS_SPACE_GENERIC);
+    SmallVector<Type *, 2> ParamTypes;
+    ParamTypes.push_back(ResultType);
+    ParamTypes.push_back(DestTy);
+    Function *CVTAFunction = Intrinsic::getDeclaration(
+        M, Intrinsic::nvvm_ptr_global_to_gen, ParamTypes);
+    CVTA = Builder.CreateCall(CVTAFunction, CVTA, "cvta");
+    // Another bitcast from i8 * to <the element type of GVType> * is
+    // required.
+    DestTy =
+        PointerType::get(GVType->getElementType(), llvm::ADDRESS_SPACE_GENERIC);
+    CVTA = Builder.CreateBitCast(CVTA, DestTy, "cvta");
+  } else {
+    // A simple CVTA is enough.
+    SmallVector<Type *, 2> ParamTypes;
+    ParamTypes.push_back(PointerType::get(GVType->getElementType(),
+                                          llvm::ADDRESS_SPACE_GENERIC));
+    ParamTypes.push_back(GVType);
+    Function *CVTAFunction = Intrinsic::getDeclaration(
+        M, Intrinsic::nvvm_ptr_global_to_gen, ParamTypes);
+    CVTA = Builder.CreateCall(CVTAFunction, GV, "cvta");
+  }
+
+  return CVTA;
+}
+
+Value *GenericToNVVM::remapConstant(Module *M, Function *F, Constant *C,
+                                    IRBuilder<> &Builder) {
+  // If the constant C has been converted already in the given function F, just
+  // return the converted value.
+  ConstantToValueMapTy::iterator CTII = ConstantToValueMap.find(C);
+  if (CTII != ConstantToValueMap.end()) {
+    return CTII->second;
+  }
+
+  Value *NewValue = C;
+  if (isa<GlobalVariable>(C)) {
+    // If the constant C is a global variable and is found in GVMap, generate a
+    // set of instructions that convert the clone of C with the global
+    // address space specifier to a generic pointer.
+    // The constant C cannot be used here, as it will be erased from the
+    // module eventually. And the clone of C with the global address space
+    // specifier cannot be used here either, as it will affect the types of
+    // other instructions in the function. Hence, this address space conversion
+    // is required.
+    GVMapTy::iterator I = GVMap.find(cast<GlobalVariable>(C));
+    if (I != GVMap.end()) {
+      NewValue = getOrInsertCVTA(M, F, I->second, Builder);
+    }
+  } else if (isa<ConstantVector>(C) || isa<ConstantArray>(C) ||
+             isa<ConstantStruct>(C)) {
+    // If any element in the constant vector or aggregate C is or uses a global
+    // variable in GVMap, the constant C needs to be reconstructed, using a set
+    // of instructions.
+    NewValue = remapConstantVectorOrConstantAggregate(M, F, C, Builder);
+  } else if (isa<ConstantExpr>(C)) {
+    // If any operand in the constant expression C is or uses a global variable
+    // in GVMap, the constant expression C needs to be reconstructed, using a
+    // set of instructions.
+    NewValue = remapConstantExpr(M, F, cast<ConstantExpr>(C), Builder);
+  }
+
+  ConstantToValueMap[C] = NewValue;
+  return NewValue;
+}
+
+Value *GenericToNVVM::remapConstantVectorOrConstantAggregate(
+    Module *M, Function *F, Constant *C, IRBuilder<> &Builder) {
+  bool OperandChanged = false;
+  SmallVector<Value *, 4> NewOperands;
+  unsigned NumOperands = C->getNumOperands();
+
+  // Check if any element is or uses a global variable in GVMap, and thus
+  // converted to another value.
+  for (unsigned i = 0; i < NumOperands; ++i) {
+    Value *Operand = C->getOperand(i);
+    Value *NewOperand = remapConstant(M, F, cast<Constant>(Operand), Builder);
+    OperandChanged |= Operand != NewOperand;
+    NewOperands.push_back(NewOperand);
+  }
+
+  // If none of the elements has been modified, return C as it is.
+  if (!OperandChanged) {
+    return C;
+  }
+
+  // If any of the elements has been modified, construct the equivalent
+  // vector or aggregate value with a set of instructions and the converted
+  // elements.
+  Value *NewValue = UndefValue::get(C->getType());
+  if (isa<ConstantVector>(C)) {
+    for (unsigned i = 0; i < NumOperands; ++i) {
+      Value *Idx = ConstantInt::get(Type::getInt32Ty(M->getContext()), i);
+      NewValue = Builder.CreateInsertElement(NewValue, NewOperands[i], Idx);
+    }
+  } else {
+    for (unsigned i = 0; i < NumOperands; ++i) {
+      NewValue =
+          Builder.CreateInsertValue(NewValue, NewOperands[i], makeArrayRef(i));
+    }
+  }
+
+  return NewValue;
+}
+
+Value *GenericToNVVM::remapConstantExpr(Module *M, Function *F, ConstantExpr *C,
+                                        IRBuilder<> &Builder) {
+  bool OperandChanged = false;
+  SmallVector<Value *, 4> NewOperands;
+  unsigned NumOperands = C->getNumOperands();
+
+  // Check if any operand is or uses a global variable in GVMap, and thus
+  // converted to another value.
+  for (unsigned i = 0; i < NumOperands; ++i) {
+    Value *Operand = C->getOperand(i);
+    Value *NewOperand = remapConstant(M, F, cast<Constant>(Operand), Builder);
+    OperandChanged |= Operand != NewOperand;
+    NewOperands.push_back(NewOperand);
+  }
+
+  // If none of the operands has been modified, return C as it is.
+  if (!OperandChanged) {
+    return C;
+  }
+
+  // If any of the operands has been modified, construct the instruction with
+  // the converted operands.
+  unsigned Opcode = C->getOpcode();
+  switch (Opcode) {
+  case Instruction::ICmp:
+    // CompareConstantExpr (icmp)
+    return Builder.CreateICmp(CmpInst::Predicate(C->getPredicate()),
+                              NewOperands[0], NewOperands[1]);
+  case Instruction::FCmp:
+    // CompareConstantExpr (fcmp)
+    assert(false && "Address space conversion should have no effect "
+                    "on floating point CompareConstantExpr (fcmp)!");
+    return C;
+  case Instruction::ExtractElement:
+    // ExtractElementConstantExpr
+    return Builder.CreateExtractElement(NewOperands[0], NewOperands[1]);
+  case Instruction::InsertElement:
+    // InsertElementConstantExpr
+    return Builder.CreateInsertElement(NewOperands[0], NewOperands[1],
+                                       NewOperands[2]);
+  case Instruction::ShuffleVector:
+    // ShuffleVector
+    return Builder.CreateShuffleVector(NewOperands[0], NewOperands[1],
+                                       NewOperands[2]);
+  case Instruction::ExtractValue:
+    // ExtractValueConstantExpr
+    return Builder.CreateExtractValue(NewOperands[0], C->getIndices());
+  case Instruction::InsertValue:
+    // InsertValueConstantExpr
+    return Builder.CreateInsertValue(NewOperands[0], NewOperands[1],
+                                     C->getIndices());
+  case Instruction::GetElementPtr:
+    // GetElementPtrConstantExpr
+    return cast<GEPOperator>(C)->isInBounds()
+               ? Builder.CreateInBoundsGEP(
+                     NewOperands[0],
+                     makeArrayRef(&NewOperands[1], NumOperands - 1))
+               : Builder.CreateGEP(
+                     NewOperands[0],
+                     makeArrayRef(&NewOperands[1], NumOperands - 1));
+  case Instruction::Select:
+    // SelectConstantExpr
+    return Builder.CreateSelect(NewOperands[0], NewOperands[1], NewOperands[2]);
+  default:
+    // BinaryConstantExpr
+    if (Instruction::isBinaryOp(Opcode)) {
+      return Builder.CreateBinOp(Instruction::BinaryOps(C->getOpcode()),
+                                 NewOperands[0], NewOperands[1]);
+    }
+    // UnaryConstantExpr
+    if (Instruction::isCast(Opcode)) {
+      return Builder.CreateCast(Instruction::CastOps(C->getOpcode()),
+                                NewOperands[0], C->getType());
+    }
+    assert(false && "GenericToNVVM encountered an unsupported ConstantExpr");
+    return C;
+  }
+}
+
+void GenericToNVVM::remapNamedMDNode(Module *M, NamedMDNode *N) {
+
+  bool OperandChanged = false;
+  SmallVector<MDNode *, 16> NewOperands;
+  unsigned NumOperands = N->getNumOperands();
+
+  // Check if any operand is or contains a global variable in GVMap, and thus
+  // converted to another value.
+ for (unsigned i = 0; i < NumOperands; ++i) { + MDNode *Operand = N->getOperand(i); + MDNode *NewOperand = remapMDNode(M, Operand); + OperandChanged |= Operand != NewOperand; + NewOperands.push_back(NewOperand); + } + + // If none of the operands has been modified, return immediately. + if (!OperandChanged) { + return; + } + + // Replace the old operands with the new operands. + N->dropAllReferences(); + for (SmallVector<MDNode *, 16>::iterator I = NewOperands.begin(), + E = NewOperands.end(); + I != E; ++I) { + N->addOperand(*I); + } +} + +MDNode *GenericToNVVM::remapMDNode(Module *M, MDNode *N) { + + bool OperandChanged = false; + SmallVector<Value *, 8> NewOperands; + unsigned NumOperands = N->getNumOperands(); + + // Check if any operand is or contains a global variable in GVMap, and thus + // converted to another value. + for (unsigned i = 0; i < NumOperands; ++i) { + Value *Operand = N->getOperand(i); + Value *NewOperand = Operand; + if (Operand) { + if (isa<GlobalVariable>(Operand)) { + GVMapTy::iterator I = GVMap.find(cast<GlobalVariable>(Operand)); + if (I != GVMap.end()) { + NewOperand = I->second; + if (++i < NumOperands) { + NewOperands.push_back(NewOperand); + // Address space of the global variable follows the global variable + // in the global variable debug info (see createGlobalVariable in + // lib/Analysis/DIBuilder.cpp). + NewOperand = + ConstantInt::get(Type::getInt32Ty(M->getContext()), + I->second->getType()->getAddressSpace()); + } + } + } else if (isa<MDNode>(Operand)) { + NewOperand = remapMDNode(M, cast<MDNode>(Operand)); + } + } + OperandChanged |= Operand != NewOperand; + NewOperands.push_back(NewOperand); + } + + // If none of the operands has been modified, return N as it is. + if (!OperandChanged) { + return N; + } + + // If any of the operands has been modified, create a new MDNode with the new + // operands. + return MDNode::get(M->getContext(), makeArrayRef(NewOperands)); +} diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 0f4c8dbce5..d4378c2322 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -42,6 +42,11 @@ static cl::opt<int> UsePrecDivF32( " IEEE Compliant F32 div.rnd if avaiable."), cl::init(2)); +static cl::opt<bool> +UsePrecSqrtF32("nvptx-prec-sqrtf32", + cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), + cl::init(true)); + /// createNVPTXISelDag - This pass converts a legalized DAG into a /// NVPTX-specific DAG, ready for instruction scheduling. FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM, @@ -74,6 +79,8 @@ NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm, // Decide how to translate f32 div do_DIVF32_PREC = UsePrecDivF32; + // Decide how to translate f32 sqrt + do_SQRTF32_PREC = UsePrecSqrtF32; // sm less than sm_20 does not support div.rnd. Use div.full. if (do_DIVF32_PREC == 2 && !Subtarget.reqPTX20()) do_DIVF32_PREC = 1; diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 70e8e46429..ed16d4450b 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -41,6 +41,10 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { // Otherwise, use div.full int do_DIVF32_PREC; + // If true, generate sqrt.rn, else generate sqrt.approx. If FTZ + // is true, then generate the corresponding FTZ version. + bool do_SQRTF32_PREC; + // If true, add .ftz to f32 instructions. 
// This is only meaningful for sm_20 and later, as the default // is not ftz. diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td index f43abe283b..da6dd39b93 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -75,6 +75,9 @@ def allowFMA_ftz : Predicate<"(allowFMA && UseF32FTZ)">; def do_DIVF32_APPROX : Predicate<"do_DIVF32_PREC==0">; def do_DIVF32_FULL : Predicate<"do_DIVF32_PREC==1">; +def do_SQRTF32_APPROX : Predicate<"do_SQRTF32_PREC==0">; +def do_SQRTF32_RN : Predicate<"do_SQRTF32_PREC==1">; + def hasHWROT32 : Predicate<"Subtarget.hasHWROT32()">; def true : Predicate<"1">; diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td index 49e2568dfa..24037cafef 100644 --- a/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -512,6 +512,16 @@ def INT_NVVM_SQRT_RM_D : F_MATH_1<"sqrt.rm.f64 \t$dst, $src0;", Float64Regs, def INT_NVVM_SQRT_RP_D : F_MATH_1<"sqrt.rp.f64 \t$dst, $src0;", Float64Regs, Float64Regs, int_nvvm_sqrt_rp_d>; +// nvvm_sqrt intrinsic +def : Pat<(int_nvvm_sqrt_f Float32Regs:$a), + (INT_NVVM_SQRT_RN_FTZ_F Float32Regs:$a)>, Requires<[doF32FTZ, do_SQRTF32_RN]>; +def : Pat<(int_nvvm_sqrt_f Float32Regs:$a), + (INT_NVVM_SQRT_RN_F Float32Regs:$a)>, Requires<[do_SQRTF32_RN]>; +def : Pat<(int_nvvm_sqrt_f Float32Regs:$a), + (INT_NVVM_SQRT_APPROX_FTZ_F Float32Regs:$a)>, Requires<[doF32FTZ]>; +def : Pat<(int_nvvm_sqrt_f Float32Regs:$a), + (INT_NVVM_SQRT_APPROX_F Float32Regs:$a)>; + // // Rsqrt // @@ -1510,38 +1520,12 @@ multiclass G_TO_NG<string Str, Intrinsic Intrin> { defm cvta_local : NG_TO_G<"local", int_nvvm_ptr_local_to_gen>; defm cvta_shared : NG_TO_G<"shared", int_nvvm_ptr_shared_to_gen>; defm cvta_global : NG_TO_G<"global", int_nvvm_ptr_global_to_gen>; +defm cvta_const : NG_TO_G<"const", int_nvvm_ptr_constant_to_gen>; defm cvta_to_local : G_TO_NG<"local", int_nvvm_ptr_gen_to_local>; defm cvta_to_shared : G_TO_NG<"shared", int_nvvm_ptr_gen_to_shared>; defm cvta_to_global : G_TO_NG<"global", int_nvvm_ptr_gen_to_global>; - -def cvta_const : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), - "mov.u32 \t$result, $src;", - [(set Int32Regs:$result, (int_nvvm_ptr_constant_to_gen Int32Regs:$src))]>; -def cvta_const_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), - "mov.u64 \t$result, $src;", - [(set Int64Regs:$result, (int_nvvm_ptr_constant_to_gen Int64Regs:$src))]>; - - - -// @TODO: Revisit this. There is a type -// contradiction between iPTRAny and iPTR for the def. 
-/*def cvta_const_addr : NVPTXInst<(outs Int32Regs:$result), (ins imemAny:$src), - "mov.u32 \t$result, $src;", - [(set Int32Regs:$result, (int_nvvm_ptr_constant_to_gen - (Wrapper tglobaladdr:$src)))]>; -def cvta_const_addr_64 : NVPTXInst<(outs Int64Regs:$result), (ins imemAny:$src), - "mov.u64 \t$result, $src;", - [(set Int64Regs:$result, (int_nvvm_ptr_constant_to_gen - (Wrapper tglobaladdr:$src)))]>;*/ - - -def cvta_to_const : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), - "mov.u32 \t$result, $src;", - [(set Int32Regs:$result, (int_nvvm_ptr_gen_to_constant Int32Regs:$src))]>; -def cvta_to_const_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), - "mov.u64 \t$result, $src;", - [(set Int64Regs:$result, (int_nvvm_ptr_gen_to_constant Int64Regs:$src))]>; +defm cvta_to_const : G_TO_NG<"const", int_nvvm_ptr_gen_to_constant>; // nvvm.ptr.gen.to.param diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 67ca6b58e5..1ae2a7cc86 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -49,6 +49,7 @@ using namespace llvm; namespace llvm { void initializeNVVMReflectPass(PassRegistry&); +void initializeGenericToNVVMPass(PassRegistry&); } extern "C" void LLVMInitializeNVPTXTarget() { @@ -62,6 +63,7 @@ extern "C" void LLVMInitializeNVPTXTarget() { // FIXME: This pass is really intended to be invoked during IR optimization, // but it's very NVPTX-specific. initializeNVVMReflectPass(*PassRegistry::getPassRegistry()); + initializeGenericToNVVMPass(*PassRegistry::getPassRegistry()); } NVPTXTargetMachine::NVPTXTargetMachine( @@ -100,6 +102,7 @@ public: return getTM<NVPTXTargetMachine>(); } + virtual void addIRPasses(); virtual bool addInstSelector(); virtual bool addPreRegAlloc(); }; @@ -110,6 +113,11 @@ TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) { return PassConfig; } +void NVPTXPassConfig::addIRPasses() { + TargetPassConfig::addIRPasses(); + addPass(createGenericToNVVMPass()); +} + bool NVPTXPassConfig::addInstSelector() { addPass(createLowerAggrCopies()); addPass(createSplitBBatBarPass()); diff --git a/lib/Target/NVPTX/NVVMReflect.cpp b/lib/Target/NVPTX/NVVMReflect.cpp index 0ad62ce39b..3cc324b85e 100644 --- a/lib/Target/NVPTX/NVVMReflect.cpp +++ b/lib/Target/NVPTX/NVVMReflect.cpp @@ -14,6 +14,7 @@ // //===----------------------------------------------------------------------===// +#include "NVPTX.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" @@ -40,7 +41,7 @@ using namespace llvm; namespace llvm { void initializeNVVMReflectPass(PassRegistry &); } namespace { -class LLVM_LIBRARY_VISIBILITY NVVMReflect : public ModulePass { +class NVVMReflect : public ModulePass { private: StringMap<int> VarMap; typedef DenseMap<std::string, int>::iterator VarMapIter; @@ -48,9 +49,18 @@ private: public: static char ID; - NVVMReflect() : ModulePass(ID) { + NVVMReflect() : ModulePass(ID), ReflectFunction(0) { + initializeNVVMReflectPass(*PassRegistry::getPassRegistry()); VarMap.clear(); - ReflectFunction = 0; + } + + NVVMReflect(const StringMap<int> &Mapping) + : ModulePass(ID), ReflectFunction(0) { + initializeNVVMReflectPass(*PassRegistry::getPassRegistry()); + for (StringMap<int>::const_iterator I = Mapping.begin(), E = Mapping.end(); + I != E; ++I) { + VarMap[(*I).getKey()] = (*I).getValue(); + } } void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); } @@ -60,6 +70,14 @@ public: }; } +ModulePass 
*llvm::createNVVMReflectPass() { + return new NVVMReflect(); +} + +ModulePass *llvm::createNVVMReflectPass(const StringMap<int>& Mapping) { + return new NVVMReflect(Mapping); +} + static cl::opt<bool> NVVMReflectEnabled("nvvm-reflect-enable", cl::init(true), cl::desc("NVVM reflection, enabled by default")); diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp index bd1c378681..3e608ca8f6 100644 --- a/lib/Target/PowerPC/PPCBranchSelector.cpp +++ b/lib/Target/PowerPC/PPCBranchSelector.cpp @@ -112,15 +112,21 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { unsigned MBBStartOffset = 0; for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I) { - if (I->getOpcode() != PPC::BCC || I->getOperand(2).isImm()) { + MachineBasicBlock *Dest = 0; + if (I->getOpcode() == PPC::BCC && !I->getOperand(2).isImm()) + Dest = I->getOperand(2).getMBB(); + else if ((I->getOpcode() == PPC::BDNZ8 || I->getOpcode() == PPC::BDNZ || + I->getOpcode() == PPC::BDZ8 || I->getOpcode() == PPC::BDZ) && + !I->getOperand(0).isImm()) + Dest = I->getOperand(0).getMBB(); + + if (!Dest) { MBBStartOffset += TII->GetInstSizeInBytes(I); continue; } // Determine the offset from the current branch to the destination // block. - MachineBasicBlock *Dest = I->getOperand(2).getMBB(); - int BranchSize; if (Dest->getNumber() <= MBB.getNumber()) { // If this is a backwards branch, the delta is the offset from the diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index b44d2482d5..d2620b2877 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -284,6 +284,17 @@ class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : Pat < (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y) >; +// Bitfield extract patterns + +def legalshift32 : ImmLeaf <i32, [{return Imm >=0 && Imm < 32;}]>; +def bfemask : PatLeaf <(imm), [{return isMask_32(N->getZExtValue());}], + SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(CountTrailingOnes_32(N->getZExtValue()), MVT::i32);}]>>; + +class BFEPattern <Instruction BFE> : Pat < + (and (srl i32:$x, legalshift32:$y), bfemask:$z), + (BFE $x, $y, $z) +>; + include "R600Instructions.td" include "SIInstrInfo.td" diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index 0ec67ce13b..31fbf32d0c 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -115,7 +115,6 @@ AMDGPUPassConfig::addPreISel() { } bool AMDGPUPassConfig::addInstSelector() { - addPass(createAMDGPUPeepholeOpt(*TM)); addPass(createAMDGPUISelDag(getAMDGPUTargetMachine())); const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); diff --git a/lib/Target/R600/AMDILDeviceInfo.cpp b/lib/Target/R600/AMDILDeviceInfo.cpp index 178795936a..126514b976 100644 --- a/lib/Target/R600/AMDILDeviceInfo.cpp +++ b/lib/Target/R600/AMDILDeviceInfo.cpp @@ -81,7 +81,8 @@ AMDGPUDevice* getDeviceFromName(const std::string &deviceName, return new AMDGPUNIDevice(ptr); } else if (deviceName == "SI" || deviceName == "tahiti" || deviceName == "pitcairn" || - deviceName == "verde" || deviceName == "oland") { + deviceName == "verde" || deviceName == "oland" || + deviceName == "hainan") { return new AMDGPUSIDevice(ptr); } else { #if DEBUG diff --git a/lib/Target/R600/AMDILPeepholeOptimizer.cpp b/lib/Target/R600/AMDILPeepholeOptimizer.cpp deleted file mode 100644 index 3a28038666..0000000000 --- a/lib/Target/R600/AMDILPeepholeOptimizer.cpp 
+++ /dev/null @@ -1,1215 +0,0 @@ -//===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// \file -//==-----------------------------------------------------------------------===// - -#define DEBUG_TYPE "PeepholeOpt" -#ifdef DEBUG -#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) -#else -#define DEBUGME 0 -#endif - -#include "AMDILDevices.h" -#include "AMDGPUInstrInfo.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/Twine.h" -#include "llvm/IR/Constants.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionAnalysis.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/MathExtras.h" - -#include <sstream> - -#if 0 -STATISTIC(PointerAssignments, "Number of dynamic pointer " - "assigments discovered"); -STATISTIC(PointerSubtract, "Number of pointer subtractions discovered"); -#endif - -using namespace llvm; -// The Peephole optimization pass is used to do simple last minute optimizations -// that are required for correct code or to remove redundant functions -namespace { - -class OpaqueType; - -class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass { -public: - TargetMachine &TM; - static char ID; - AMDGPUPeepholeOpt(TargetMachine &tm); - ~AMDGPUPeepholeOpt(); - const char *getPassName() const; - bool runOnFunction(Function &F); - bool doInitialization(Module &M); - bool doFinalization(Module &M); - void getAnalysisUsage(AnalysisUsage &AU) const; -protected: -private: - // Function to initiate all of the instruction level optimizations. - bool instLevelOptimizations(BasicBlock::iterator *inst); - // Quick check to see if we need to dump all of the pointers into the - // arena. If this is correct, then we set all pointers to exist in arena. This - // is a workaround for aliasing of pointers in a struct/union. - bool dumpAllIntoArena(Function &F); - // Because I don't want to invalidate any pointers while in the - // safeNestedForEachFunction. I push atomic conversions to a vector and handle - // it later. This function does the conversions if required. - void doAtomicConversionIfNeeded(Function &F); - // Because __amdil_is_constant cannot be properly evaluated if - // optimizations are disabled, the call's are placed in a vector - // and evaluated after the __amdil_image* functions are evaluated - // which should allow the __amdil_is_constant function to be - // evaluated correctly. - void doIsConstCallConversionIfNeeded(); - bool mChanged; - bool mDebug; - bool mConvertAtomics; - CodeGenOpt::Level optLevel; - // Run a series of tests to see if we can optimize a CALL instruction. - bool optimizeCallInst(BasicBlock::iterator *bbb); - // A peephole optimization to optimize bit extract sequences. - bool optimizeBitExtract(Instruction *inst); - // A peephole optimization to optimize bit insert sequences. - bool optimizeBitInsert(Instruction *inst); - bool setupBitInsert(Instruction *base, - Instruction *&src, - Constant *&mask, - Constant *&shift); - // Expand the bit field insert instruction on versions of OpenCL that - // don't support it. - bool expandBFI(CallInst *CI); - // Expand the bit field mask instruction on version of OpenCL that - // don't support it. 
- bool expandBFM(CallInst *CI); - // On 7XX and 8XX operations, we do not have 24 bit signed operations. So in - // this case we need to expand them. These functions check for 24bit functions - // and then expand. - bool isSigned24BitOps(CallInst *CI); - void expandSigned24BitOps(CallInst *CI); - // One optimization that can occur is that if the required workgroup size is - // specified then the result of get_local_size is known at compile time and - // can be returned accordingly. - bool isRWGLocalOpt(CallInst *CI); - // On northern island cards, the division is slightly less accurate than on - // previous generations, so we need to utilize a more accurate division. So we - // can translate the accurate divide to a normal divide on all other cards. - bool convertAccurateDivide(CallInst *CI); - void expandAccurateDivide(CallInst *CI); - // If the alignment is set incorrectly, it can produce really inefficient - // code. This checks for this scenario and fixes it if possible. - bool correctMisalignedMemOp(Instruction *inst); - - // If we are in no opt mode, then we need to make sure that - // local samplers are properly propagated as constant propagation - // doesn't occur and we need to know the value of kernel defined - // samplers at compile time. - bool propagateSamplerInst(CallInst *CI); - - // Helper functions - - // Group of functions that recursively calculate the size of a structure based - // on it's sub-types. - size_t getTypeSize(Type * const T, bool dereferencePtr = false); - size_t getTypeSize(StructType * const ST, bool dereferencePtr = false); - size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false); - size_t getTypeSize(FunctionType * const FT,bool dereferencePtr = false); - size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false); - size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false); - size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false); - size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false); - - LLVMContext *mCTX; - Function *mF; - const AMDGPUSubtarget *mSTM; - SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs; - SmallVector<CallInst *, 16> isConstVec; -}; // class AMDGPUPeepholeOpt - char AMDGPUPeepholeOpt::ID = 0; - -// A template function that has two levels of looping before calling the -// function with a pointer to the current iterator. 
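(A self-contained STL analog of the iterate-while-erasing discipline that the safeNestedForEach template below implements — illustrative only, not LLVM code: the inner iterator advances only when the callback did not erase the current element, which is exactly why the template threads an iterator pointer through the callback.)

    #include <iostream>
    #include <list>
    #include <vector>

    int main() {
      std::vector<std::list<int> > blocks(1);
      for (int v = 1; v <= 5; ++v)
        blocks[0].push_back(v);
      // Outer/inner loop mirroring safeNestedForEach: erase even elements
      // in place without invalidating the iterator we stand on.
      for (std::vector<std::list<int> >::iterator b = blocks.begin();
           b != blocks.end(); ++b)
        for (std::list<int>::iterator i = b->begin(); i != b->end();)
          if (*i % 2 == 0)
            i = b->erase(i); // erase returns the next valid iterator
          else
            ++i;             // advance only when nothing was erased
      for (std::list<int>::iterator i = blocks[0].begin();
           i != blocks[0].end(); ++i)
        std::cout << *i << ' '; // prints: 1 3 5
      std::cout << '\n';
      return 0;
    }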
-template<class InputIterator, class SecondIterator, class Function> -Function safeNestedForEach(InputIterator First, InputIterator Last, - SecondIterator S, Function F) { - for ( ; First != Last; ++First) { - SecondIterator sf, sl; - for (sf = First->begin(), sl = First->end(); - sf != sl; ) { - if (!F(&sf)) { - ++sf; - } - } - } - return F; -} - -} // anonymous namespace - -namespace llvm { - FunctionPass * - createAMDGPUPeepholeOpt(TargetMachine &tm) { - return new AMDGPUPeepholeOpt(tm); - } -} // llvm namespace - -AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm) - : FunctionPass(ID), TM(tm) { - mDebug = DEBUGME; - optLevel = TM.getOptLevel(); - -} - -AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt() { -} - -const char * -AMDGPUPeepholeOpt::getPassName() const { - return "AMDGPU PeepHole Optimization Pass"; -} - -bool -containsPointerType(Type *Ty) { - if (!Ty) { - return false; - } - switch(Ty->getTypeID()) { - default: - return false; - case Type::StructTyID: { - const StructType *ST = dyn_cast<StructType>(Ty); - for (StructType::element_iterator stb = ST->element_begin(), - ste = ST->element_end(); stb != ste; ++stb) { - if (!containsPointerType(*stb)) { - continue; - } - return true; - } - break; - } - case Type::VectorTyID: - case Type::ArrayTyID: - return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType()); - case Type::PointerTyID: - return true; - }; - return false; -} - -bool -AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F) { - bool dumpAll = false; - for (Function::const_arg_iterator cab = F.arg_begin(), - cae = F.arg_end(); cab != cae; ++cab) { - const Argument *arg = cab; - const PointerType *PT = dyn_cast<PointerType>(arg->getType()); - if (!PT) { - continue; - } - Type *DereferencedType = PT->getElementType(); - if (!dyn_cast<StructType>(DereferencedType) - ) { - continue; - } - if (!containsPointerType(DereferencedType)) { - continue; - } - // FIXME: Because a pointer inside of a struct/union may be aliased to - // another pointer we need to take the conservative approach and place all - // pointers into the arena until more advanced detection is implemented. - dumpAll = true; - } - return dumpAll; -} -void -AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() { - if (isConstVec.empty()) { - return; - } - for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) { - CallInst *CI = isConstVec[x]; - Constant *CV = dyn_cast<Constant>(CI->getOperand(0)); - Type *aType = Type::getInt32Ty(*mCTX); - Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1) - : ConstantInt::get(aType, 0); - CI->replaceAllUsesWith(Val); - CI->eraseFromParent(); - } - isConstVec.clear(); -} -void -AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F) { - // Don't do anything if we don't have any atomic operations. 
- if (atomicFuncs.empty()) { - return; - } - // Change the function name for the atomic if it is required - uint32_t size = atomicFuncs.size(); - for (uint32_t x = 0; x < size; ++x) { - atomicFuncs[x].first->setOperand( - atomicFuncs[x].first->getNumOperands()-1, - atomicFuncs[x].second); - - } - mChanged = true; - if (mConvertAtomics) { - return; - } -} - -bool -AMDGPUPeepholeOpt::runOnFunction(Function &MF) { - mChanged = false; - mF = &MF; - mSTM = &TM.getSubtarget<AMDGPUSubtarget>(); - if (mDebug) { - MF.dump(); - } - mCTX = &MF.getType()->getContext(); - mConvertAtomics = true; - safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(), - std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations), - this)); - - doAtomicConversionIfNeeded(MF); - doIsConstCallConversionIfNeeded(); - - if (mDebug) { - MF.dump(); - } - return mChanged; -} - -bool -AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) { - Instruction *inst = (*bbb); - CallInst *CI = dyn_cast<CallInst>(inst); - if (!CI) { - return false; - } - if (isSigned24BitOps(CI)) { - expandSigned24BitOps(CI); - ++(*bbb); - CI->eraseFromParent(); - return true; - } - if (propagateSamplerInst(CI)) { - return false; - } - if (expandBFI(CI) || expandBFM(CI)) { - ++(*bbb); - CI->eraseFromParent(); - return true; - } - if (convertAccurateDivide(CI)) { - expandAccurateDivide(CI); - ++(*bbb); - CI->eraseFromParent(); - return true; - } - - StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName(); - if (calleeName.startswith("__amdil_is_constant")) { - // If we do not have optimizations, then this - // cannot be properly evaluated, so we add the - // call instruction to a vector and process - // them at the end of processing after the - // samplers have been correctly handled. - if (optLevel == CodeGenOpt::None) { - isConstVec.push_back(CI); - return false; - } else { - Constant *CV = dyn_cast<Constant>(CI->getOperand(0)); - Type *aType = Type::getInt32Ty(*mCTX); - Value *Val = (CV != NULL) ? 
ConstantInt::get(aType, 1) - : ConstantInt::get(aType, 0); - CI->replaceAllUsesWith(Val); - ++(*bbb); - CI->eraseFromParent(); - return true; - } - } - - if (calleeName.equals("__amdil_is_asic_id_i32")) { - ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0)); - Type *aType = Type::getInt32Ty(*mCTX); - Value *Val = CV; - if (Val) { - Val = ConstantInt::get(aType, - mSTM->device()->getDeviceFlag() & CV->getZExtValue()); - } else { - Val = ConstantInt::get(aType, 0); - } - CI->replaceAllUsesWith(Val); - ++(*bbb); - CI->eraseFromParent(); - return true; - } - Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1)); - if (!F) { - return false; - } - if (F->getName().startswith("__atom") && !CI->getNumUses() - && F->getName().find("_xchg") == StringRef::npos) { - std::string buffer(F->getName().str() + "_noret"); - F = dyn_cast<Function>( - F->getParent()->getOrInsertFunction(buffer, F->getFunctionType())); - atomicFuncs.push_back(std::make_pair(CI, F)); - } - - if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment) - && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) { - return false; - } - if (!mConvertAtomics) { - return false; - } - StringRef name = F->getName(); - if (name.startswith("__atom") && name.find("_g") != StringRef::npos) { - mConvertAtomics = false; - } - return false; -} - -bool -AMDGPUPeepholeOpt::setupBitInsert(Instruction *base, - Instruction *&src, - Constant *&mask, - Constant *&shift) { - if (!base) { - if (mDebug) { - dbgs() << "Null pointer passed into function.\n"; - } - return false; - } - bool andOp = false; - if (base->getOpcode() == Instruction::Shl) { - shift = dyn_cast<Constant>(base->getOperand(1)); - } else if (base->getOpcode() == Instruction::And) { - mask = dyn_cast<Constant>(base->getOperand(1)); - andOp = true; - } else { - if (mDebug) { - dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n"; - } - // If the base is neither a Shl or a And, we don't fit any of the patterns above. - return false; - } - src = dyn_cast<Instruction>(base->getOperand(0)); - if (!src) { - if (mDebug) { - dbgs() << "Failed setup since the base operand is not an instruction!\n"; - } - return false; - } - // If we find an 'and' operation, then we don't need to - // find the next operation as we already know the - // bits that are valid at this point. - if (andOp) { - return true; - } - if (src->getOpcode() == Instruction::Shl && !shift) { - shift = dyn_cast<Constant>(src->getOperand(1)); - src = dyn_cast<Instruction>(src->getOperand(0)); - } else if (src->getOpcode() == Instruction::And && !mask) { - mask = dyn_cast<Constant>(src->getOperand(1)); - } - if (!mask && !shift) { - if (mDebug) { - dbgs() << "Failed setup since both mask and shift are NULL!\n"; - } - // Did not find a constant mask or a shift. - return false; - } - return true; -} -bool -AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst) { - if (!inst) { - return false; - } - if (!inst->isBinaryOp()) { - return false; - } - if (inst->getOpcode() != Instruction::Or) { - return false; - } - if (optLevel == CodeGenOpt::None) { - return false; - } - // We want to do an optimization on a sequence of ops that in the end equals a - // single ISA instruction. 
- // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F) - // Some simplified versions of this pattern are as follows: - // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0 - // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E - // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B - // (A & B) | (D << F) when (1 << F) >= B - // (A << C) | (D & E) when (1 << C) >= E - if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) { - // The HD4XXX hardware doesn't support the ubit_insert instruction. - return false; - } - Type *aType = inst->getType(); - bool isVector = aType->isVectorTy(); - int numEle = 1; - // This optimization only works on 32bit integers. - if (aType->getScalarType() - != Type::getInt32Ty(inst->getContext())) { - return false; - } - if (isVector) { - const VectorType *VT = dyn_cast<VectorType>(aType); - numEle = VT->getNumElements(); - // We currently cannot support more than 4 elements in a intrinsic and we - // cannot support Vec3 types. - if (numEle > 4 || numEle == 3) { - return false; - } - } - // TODO: Handle vectors. - if (isVector) { - if (mDebug) { - dbgs() << "!!! Vectors are not supported yet!\n"; - } - return false; - } - Instruction *LHSSrc = NULL, *RHSSrc = NULL; - Constant *LHSMask = NULL, *RHSMask = NULL; - Constant *LHSShift = NULL, *RHSShift = NULL; - Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0)); - Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1)); - if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) { - if (mDebug) { - dbgs() << "Found an OR Operation that failed setup!\n"; - inst->dump(); - if (LHS) { LHS->dump(); } - if (LHSSrc) { LHSSrc->dump(); } - if (LHSMask) { LHSMask->dump(); } - if (LHSShift) { LHSShift->dump(); } - } - // There was an issue with the setup for BitInsert. - return false; - } - if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) { - if (mDebug) { - dbgs() << "Found an OR Operation that failed setup!\n"; - inst->dump(); - if (RHS) { RHS->dump(); } - if (RHSSrc) { RHSSrc->dump(); } - if (RHSMask) { RHSMask->dump(); } - if (RHSShift) { RHSShift->dump(); } - } - // There was an issue with the setup for BitInsert. - return false; - } - if (mDebug) { - dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n"; - dbgs() << "Op: "; inst->dump(); - dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; } - dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; } - dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; } - dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; } - dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; } - dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; } - dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; } - dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; } - } - Constant *offset = NULL; - Constant *width = NULL; - uint32_t lhsMaskVal = 0, rhsMaskVal = 0; - uint32_t lhsShiftVal = 0, rhsShiftVal = 0; - uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0; - uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0; - lhsMaskVal = (LHSMask - ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0); - rhsMaskVal = (RHSMask - ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0); - lhsShiftVal = (LHSShift - ? 
dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0); - rhsShiftVal = (RHSShift - ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0); - lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal; - rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal; - lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal; - rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal; - // TODO: Handle the case of A & B | D & ~B(i.e. inverted masks). - if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) { - return false; - } - if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) { - offset = ConstantInt::get(aType, lhsMaskOffset, false); - width = ConstantInt::get(aType, lhsMaskWidth, false); - RHSSrc = RHS; - if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) { - return false; - } - if (!LHSShift) { - LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, - "MaskShr", LHS); - } else if (lhsShiftVal != lhsMaskOffset) { - LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, - "MaskShr", LHS); - } - if (mDebug) { - dbgs() << "Optimizing LHS!\n"; - } - } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) { - offset = ConstantInt::get(aType, rhsMaskOffset, false); - width = ConstantInt::get(aType, rhsMaskWidth, false); - LHSSrc = RHSSrc; - RHSSrc = LHS; - if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) { - return false; - } - if (!RHSShift) { - LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, - "MaskShr", RHS); - } else if (rhsShiftVal != rhsMaskOffset) { - LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, - "MaskShr", RHS); - } - if (mDebug) { - dbgs() << "Optimizing RHS!\n"; - } - } else { - if (mDebug) { - dbgs() << "Failed constraint 3!\n"; - } - return false; - } - if (mDebug) { - dbgs() << "Width: "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; } - dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; } - dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; } - dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; } - } - if (!offset || !width) { - if (mDebug) { - dbgs() << "Either width or offset are NULL, failed detection!\n"; - } - return false; - } - // Lets create the function signature. 
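(For reference, a self-contained model of what __amdil_ubit_insert(width, offset, src, dst) plausibly computes; the semantics are assumed from the surrounding pass — the Operands[4] = {width, offset, LHSSrc, RHSSrc} call built below — not taken from an IL specification:)

    #include <cassert>
    #include <cstdint>

    // Replace `width` bits of dst, starting at bit `offset`, with the low
    // bits of src shifted into position.
    static uint32_t ubitInsert(uint32_t width, uint32_t offset,
                               uint32_t src, uint32_t dst) {
      uint32_t mask = (width >= 32 ? ~0u : ((1u << width) - 1u)) << offset;
      return (dst & ~mask) | ((src << offset) & mask);
    }

    int main() {
      assert(ubitInsert(4, 0, 0x5, 0xAB) == 0xA5);
      assert(ubitInsert(8, 8, 0x12, 0xABCD00FFu) == 0xABCD12FFu);
      return 0;
    }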
- std::vector<Type *> callTypes; - callTypes.push_back(aType); - callTypes.push_back(aType); - callTypes.push_back(aType); - callTypes.push_back(aType); - FunctionType *funcType = FunctionType::get(aType, callTypes, false); - std::string name = "__amdil_ubit_insert"; - if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; } - Function *Func = - dyn_cast<Function>(inst->getParent()->getParent()->getParent()-> - getOrInsertFunction(StringRef(name), funcType)); - Value *Operands[4] = { - width, - offset, - LHSSrc, - RHSSrc - }; - CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt"); - if (mDebug) { - dbgs() << "Old Inst: "; - inst->dump(); - dbgs() << "New Inst: "; - CI->dump(); - dbgs() << "\n\n"; - } - CI->insertBefore(inst); - inst->replaceAllUsesWith(CI); - return true; -} - -bool -AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst) { - if (!inst) { - return false; - } - if (!inst->isBinaryOp()) { - return false; - } - if (inst->getOpcode() != Instruction::And) { - return false; - } - if (optLevel == CodeGenOpt::None) { - return false; - } - // We want to do some simple optimizations on Shift right/And patterns. The - // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a - // value smaller than 32 and C is a mask. If C is a constant value, then the - // following transformation can occur. For signed integers, it turns into the - // function call dst = __amdil_ibit_extract(log2(C), B, A) For unsigned - // integers, it turns into the function call dst = - // __amdil_ubit_extract(log2(C), B, A) The function __amdil_[u|i]bit_extract - // can be found in Section 7.9 of the ATI IL spec of the stream SDK for - // Evergreen hardware. - if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) { - // This does not work on HD4XXX hardware. - return false; - } - Type *aType = inst->getType(); - bool isVector = aType->isVectorTy(); - - // XXX Support vector types - if (isVector) { - return false; - } - int numEle = 1; - // This only works on 32bit integers - if (aType->getScalarType() - != Type::getInt32Ty(inst->getContext())) { - return false; - } - if (isVector) { - const VectorType *VT = dyn_cast<VectorType>(aType); - numEle = VT->getNumElements(); - // We currently cannot support more than 4 elements in a intrinsic and we - // cannot support Vec3 types. - if (numEle > 4 || numEle == 3) { - return false; - } - } - BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0)); - // If the first operand is not a shift instruction, then we can return as it - // doesn't match this pattern. - if (!ShiftInst || !ShiftInst->isShift()) { - return false; - } - // If we are a shift left, then we need don't match this pattern. - if (ShiftInst->getOpcode() == Instruction::Shl) { - return false; - } - bool isSigned = ShiftInst->isArithmeticShift(); - Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1)); - Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1)); - // Lets make sure that the shift value and the and mask are constant integers. 
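(The validity checks that follow reduce to simple mask arithmetic; a standalone illustration in plain C++, mirroring what isMask_32 and CountTrailingOnes_32 from llvm/Support/MathExtras.h compute:)

    #include <cassert>
    #include <cstdint>

    // A "mask" here means all ones in the low bits, i.e. a value 2^k - 1.
    static bool isMask32(uint32_t v) { return v != 0 && ((v + 1) & v) == 0; }

    int main() {
      uint32_t x = 0xABCD1234u, shift = 8, mask = 0xFF;
      assert(isMask32(mask));
      uint32_t width = 8;                 // CountTrailingOnes_32(0xFF)
      assert(width <= 32 - shift);        // the extracted field must fit
      uint32_t bfe = (x >> shift) & mask; // the pattern being matched
      assert(bfe == 0x12);
      return 0;
    }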
- if (!AndMask || !ShrVal) { - return false; - } - Constant *newMaskConst; - Constant *shiftValConst; - if (isVector) { - // Handle the vector case - std::vector<Constant *> maskVals; - std::vector<Constant *> shiftVals; - ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask); - ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal); - Type *scalarType = AndMaskVec->getType()->getScalarType(); - assert(AndMaskVec->getNumOperands() == - ShrValVec->getNumOperands() && "cannot have a " - "combination where the number of elements to a " - "shift and an and are different!"); - for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) { - ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x)); - ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x)); - if (!AndCI || !ShiftIC) { - return false; - } - uint32_t maskVal = (uint32_t)AndCI->getZExtValue(); - if (!isMask_32(maskVal)) { - return false; - } - maskVal = (uint32_t)CountTrailingOnes_32(maskVal); - uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue(); - // If the mask or shiftval is greater than the bitcount, then break out. - if (maskVal >= 32 || shiftVal >= 32) { - return false; - } - // If the mask val is greater than the the number of original bits left - // then this optimization is invalid. - if (maskVal > (32 - shiftVal)) { - return false; - } - maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned)); - shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned)); - } - newMaskConst = ConstantVector::get(maskVals); - shiftValConst = ConstantVector::get(shiftVals); - } else { - // Handle the scalar case - uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue(); - // This must be a mask value where all lower bits are set to 1 and then any - // bit higher is set to 0. - if (!isMask_32(maskVal)) { - return false; - } - maskVal = (uint32_t)CountTrailingOnes_32(maskVal); - // Count the number of bits set in the mask, this is the width of the - // resulting bit set that is extracted from the source value. - uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue(); - // If the mask or shift val is greater than the bitcount, then break out. - if (maskVal >= 32 || shiftVal >= 32) { - return false; - } - // If the mask val is greater than the the number of original bits left then - // this optimization is invalid. - if (maskVal > (32 - shiftVal)) { - return false; - } - newMaskConst = ConstantInt::get(aType, maskVal, isSigned); - shiftValConst = ConstantInt::get(aType, shiftVal, isSigned); - } - // Lets create the function signature. - std::vector<Type *> callTypes; - callTypes.push_back(aType); - callTypes.push_back(aType); - callTypes.push_back(aType); - FunctionType *funcType = FunctionType::get(aType, callTypes, false); - std::string name = "llvm.AMDGPU.bit.extract.u32"; - if (isVector) { - name += ".v" + itostr(numEle) + "i32"; - } else { - name += "."; - } - // Lets create the function. 
- Function *Func = - dyn_cast<Function>(inst->getParent()->getParent()->getParent()-> - getOrInsertFunction(StringRef(name), funcType)); - Value *Operands[3] = { - ShiftInst->getOperand(0), - shiftValConst, - newMaskConst - }; - // Lets create the Call with the operands - CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt"); - CI->setDoesNotAccessMemory(); - CI->insertBefore(inst); - inst->replaceAllUsesWith(CI); - return true; -} - -bool -AMDGPUPeepholeOpt::expandBFI(CallInst *CI) { - if (!CI) { - return false; - } - Value *LHS = CI->getOperand(CI->getNumOperands() - 1); - if (!LHS->getName().startswith("__amdil_bfi")) { - return false; - } - Type* type = CI->getOperand(0)->getType(); - Constant *negOneConst = NULL; - if (type->isVectorTy()) { - std::vector<Constant *> negOneVals; - negOneConst = ConstantInt::get(CI->getContext(), - APInt(32, StringRef("-1"), 10)); - for (size_t x = 0, - y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) { - negOneVals.push_back(negOneConst); - } - negOneConst = ConstantVector::get(negOneVals); - } else { - negOneConst = ConstantInt::get(CI->getContext(), - APInt(32, StringRef("-1"), 10)); - } - // __amdil_bfi => (A & B) | (~A & C) - BinaryOperator *lhs = - BinaryOperator::Create(Instruction::And, CI->getOperand(0), - CI->getOperand(1), "bfi_and", CI); - BinaryOperator *rhs = - BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst, - "bfi_not", CI); - rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2), - "bfi_and", CI); - lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI); - CI->replaceAllUsesWith(lhs); - return true; -} - -bool -AMDGPUPeepholeOpt::expandBFM(CallInst *CI) { - if (!CI) { - return false; - } - Value *LHS = CI->getOperand(CI->getNumOperands() - 1); - if (!LHS->getName().startswith("__amdil_bfm")) { - return false; - } - // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f) - Constant *newMaskConst = NULL; - Constant *newShiftConst = NULL; - Type* type = CI->getOperand(0)->getType(); - if (type->isVectorTy()) { - std::vector<Constant*> newMaskVals, newShiftVals; - newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F); - newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1); - for (size_t x = 0, - y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) { - newMaskVals.push_back(newMaskConst); - newShiftVals.push_back(newShiftConst); - } - newMaskConst = ConstantVector::get(newMaskVals); - newShiftConst = ConstantVector::get(newShiftVals); - } else { - newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F); - newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1); - } - BinaryOperator *lhs = - BinaryOperator::Create(Instruction::And, CI->getOperand(0), - newMaskConst, "bfm_mask", CI); - lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst, - lhs, "bfm_shl", CI); - lhs = BinaryOperator::Create(Instruction::Sub, lhs, - newShiftConst, "bfm_sub", CI); - BinaryOperator *rhs = - BinaryOperator::Create(Instruction::And, CI->getOperand(1), - newMaskConst, "bfm_mask", CI); - lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI); - CI->replaceAllUsesWith(lhs); - return true; -} - -bool -AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) { - Instruction *inst = (*bbb); - if (optimizeCallInst(bbb)) { - return true; - } - if (optimizeBitExtract(inst)) { - return false; - } - if (optimizeBitInsert(inst)) { - return false; - } - if (correctMisalignedMemOp(inst)) { - return false; - } - return 
false; -} -bool -AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) { - LoadInst *linst = dyn_cast<LoadInst>(inst); - StoreInst *sinst = dyn_cast<StoreInst>(inst); - unsigned alignment; - Type* Ty = inst->getType(); - if (linst) { - alignment = linst->getAlignment(); - Ty = inst->getType(); - } else if (sinst) { - alignment = sinst->getAlignment(); - Ty = sinst->getValueOperand()->getType(); - } else { - return false; - } - unsigned size = getTypeSize(Ty); - if (size == alignment || size < alignment) { - return false; - } - if (!Ty->isStructTy()) { - return false; - } - if (alignment < 4) { - if (linst) { - linst->setAlignment(0); - return true; - } else if (sinst) { - sinst->setAlignment(0); - return true; - } - } - return false; -} -bool -AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI) { - if (!CI) { - return false; - } - Value *LHS = CI->getOperand(CI->getNumOperands() - 1); - std::string namePrefix = LHS->getName().substr(0, 14); - if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24" - && namePrefix != "__amdil__imul24_high") { - return false; - } - if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) { - return false; - } - return true; -} - -void -AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI) { - assert(isSigned24BitOps(CI) && "Must be a " - "signed 24 bit operation to call this function!"); - Value *LHS = CI->getOperand(CI->getNumOperands()-1); - // On 7XX and 8XX we do not have signed 24bit, so we need to - // expand it to the following: - // imul24 turns into 32bit imul - // imad24 turns into 32bit imad - // imul24_high turns into 32bit imulhigh - if (LHS->getName().substr(0, 14) == "__amdil_imad24") { - Type *aType = CI->getOperand(0)->getType(); - bool isVector = aType->isVectorTy(); - int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1; - std::vector<Type*> callTypes; - callTypes.push_back(CI->getOperand(0)->getType()); - callTypes.push_back(CI->getOperand(1)->getType()); - callTypes.push_back(CI->getOperand(2)->getType()); - FunctionType *funcType = - FunctionType::get(CI->getOperand(0)->getType(), callTypes, false); - std::string name = "__amdil_imad"; - if (isVector) { - name += "_v" + itostr(numEle) + "i32"; - } else { - name += "_i32"; - } - Function *Func = dyn_cast<Function>( - CI->getParent()->getParent()->getParent()-> - getOrInsertFunction(StringRef(name), funcType)); - Value *Operands[3] = { - CI->getOperand(0), - CI->getOperand(1), - CI->getOperand(2) - }; - CallInst *nCI = CallInst::Create(Func, Operands, "imad24"); - nCI->insertBefore(CI); - CI->replaceAllUsesWith(nCI); - } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") { - BinaryOperator *mulOp = - BinaryOperator::Create(Instruction::Mul, CI->getOperand(0), - CI->getOperand(1), "imul24", CI); - CI->replaceAllUsesWith(mulOp); - } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") { - Type *aType = CI->getOperand(0)->getType(); - - bool isVector = aType->isVectorTy(); - int numEle = isVector ? 
dyn_cast<VectorType>(aType)->getNumElements() : 1; - std::vector<Type*> callTypes; - callTypes.push_back(CI->getOperand(0)->getType()); - callTypes.push_back(CI->getOperand(1)->getType()); - FunctionType *funcType = - FunctionType::get(CI->getOperand(0)->getType(), callTypes, false); - std::string name = "__amdil_imul_high"; - if (isVector) { - name += "_v" + itostr(numEle) + "i32"; - } else { - name += "_i32"; - } - Function *Func = dyn_cast<Function>( - CI->getParent()->getParent()->getParent()-> - getOrInsertFunction(StringRef(name), funcType)); - Value *Operands[2] = { - CI->getOperand(0), - CI->getOperand(1) - }; - CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high"); - nCI->insertBefore(CI); - CI->replaceAllUsesWith(nCI); - } -} - -bool -AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI) { - return (CI != NULL - && CI->getOperand(CI->getNumOperands() - 1)->getName() - == "__amdil_get_local_size_int"); -} - -bool -AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI) { - if (!CI) { - return false; - } - if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX - && (mSTM->getDeviceName() == "cayman")) { - return false; - } - return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20) - == "__amdil_improved_div"; -} - -void -AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI) { - assert(convertAccurateDivide(CI) - && "expanding accurate divide can only happen if it is expandable!"); - BinaryOperator *divOp = - BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0), - CI->getOperand(1), "fdiv32", CI); - CI->replaceAllUsesWith(divOp); -} - -bool -AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI) { - if (optLevel != CodeGenOpt::None) { - return false; - } - - if (!CI) { - return false; - } - - unsigned funcNameIdx = 0; - funcNameIdx = CI->getNumOperands() - 1; - StringRef calleeName = CI->getOperand(funcNameIdx)->getName(); - if (calleeName != "__amdil_image2d_read_norm" - && calleeName != "__amdil_image2d_read_unnorm" - && calleeName != "__amdil_image3d_read_norm" - && calleeName != "__amdil_image3d_read_unnorm") { - return false; - } - - unsigned samplerIdx = 2; - samplerIdx = 1; - Value *sampler = CI->getOperand(samplerIdx); - LoadInst *lInst = dyn_cast<LoadInst>(sampler); - if (!lInst) { - return false; - } - - if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { - return false; - } - - GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand()); - // If we are loading from what is not a global value, then we - // fail and return. - if (!gv) { - return false; - } - - // If there is no initializer, or the initializer is not a - // 32-bit integer, we fail. - if (!gv->hasInitializer() - || !gv->getInitializer()->getType()->isIntegerTy(32)) { - return false; - } - - // Now that we have the global variable initializer, let's replace - // all uses of the load instruction with the samplerVal and - // reparse the __amdil_is_constant() function. 
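The expandBFI and expandBFM rewrites above are pure bit arithmetic once the IR plumbing is stripped away. A minimal standalone C++ check of the two identities, bfi(a, b, c) = (a & b) | (~a & c) and bfm(w, s) = ((1 << (w & 0x1F)) - 1) << (s & 0x1F), is sketched below; the function names are illustrative, not part of the backend:

#include <cassert>
#include <cstdint>

// Bit-field insert: take bits from b where a is 1, from c where a is 0.
static uint32_t bfi(uint32_t a, uint32_t b, uint32_t c) {
  return (a & b) | (~a & c);
}

// Bit-field mask: (w & 0x1F) ones, shifted left by (s & 0x1F).
static uint32_t bfm(uint32_t w, uint32_t s) {
  return ((1u << (w & 0x1F)) - 1u) << (s & 0x1F);
}

int main() {
  assert(bfi(0x0000FFFFu, 0x12345678u, 0xABCDEF01u) == 0xABCD5678u);
  assert(bfm(4, 8) == 0x00000F00u); // four ones starting at bit 8
  return 0;
}
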
- Constant *samplerVal = gv->getInitializer(); - lInst->replaceAllUsesWith(samplerVal); - return true; -} - -bool -AMDGPUPeepholeOpt::doInitialization(Module &M) { - return false; -} - -bool -AMDGPUPeepholeOpt::doFinalization(Module &M) { - return false; -} - -void -AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<MachineFunctionAnalysis>(); - FunctionPass::getAnalysisUsage(AU); - AU.setPreservesAll(); -} - -size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) { - size_t size = 0; - if (!T) { - return size; - } - switch (T->getTypeID()) { - case Type::X86_FP80TyID: - case Type::FP128TyID: - case Type::PPC_FP128TyID: - case Type::LabelTyID: - assert(0 && "These types are not supported by this backend"); - default: - case Type::FloatTyID: - case Type::DoubleTyID: - size = T->getPrimitiveSizeInBits() >> 3; - break; - case Type::PointerTyID: - size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr); - break; - case Type::IntegerTyID: - size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr); - break; - case Type::StructTyID: - size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr); - break; - case Type::ArrayTyID: - size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr); - break; - case Type::FunctionTyID: - size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr); - break; - case Type::VectorTyID: - size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr); - break; - }; - return size; -} - -size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST, - bool dereferencePtr) { - size_t size = 0; - if (!ST) { - return size; - } - Type *curType; - StructType::element_iterator eib; - StructType::element_iterator eie; - for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) { - curType = *eib; - size += getTypeSize(curType, dereferencePtr); - } - return size; -} - -size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT, - bool dereferencePtr) { - return IT ? (IT->getBitWidth() >> 3) : 0; -} - -size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT, - bool dereferencePtr) { - assert(0 && "Should not be able to calculate the size of a function type"); - return 0; -} - -size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT, - bool dereferencePtr) { - return (size_t)(AT ? (getTypeSize(AT->getElementType(), - dereferencePtr) * AT->getNumElements()) - : 0); -} - -size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT, - bool dereferencePtr) { - return VT ? 
(VT->getBitWidth() >> 3) : 0; -} - -size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT, - bool dereferencePtr) { - if (!PT) { - return 0; - } - Type *CT = PT->getElementType(); - if (CT->getTypeID() == Type::StructTyID && - PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { - return getTypeSize(dyn_cast<StructType>(CT)); - } else if (dereferencePtr) { - size_t size = 0; - for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) { - size += getTypeSize(PT->getContainedType(x), dereferencePtr); - } - return size; - } else { - return 4; - } -} - -size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT, - bool dereferencePtr) { - //assert(0 && "Should not be able to calculate the size of an opaque type"); - return 4; -} diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt index 2ad2047278..97f0a40c29 100644 --- a/lib/Target/R600/CMakeLists.txt +++ b/lib/Target/R600/CMakeLists.txt @@ -21,7 +21,6 @@ add_llvm_target(R600CodeGen AMDILISelDAGToDAG.cpp AMDILISelLowering.cpp AMDILNIDevice.cpp - AMDILPeepholeOptimizer.cpp AMDILSIDevice.cpp AMDGPUAsmPrinter.cpp AMDGPUFrameLowering.cpp diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 45d009c2a0..61d70bb342 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -78,7 +78,7 @@ static MCCodeEmitter *createAMDGPUMCCodeEmitter(const MCInstrInfo &MCII, if (STI.getFeatureBits() & AMDGPU::Feature64BitPtr) { return createSIMCCodeEmitter(MCII, MRI, STI, Ctx); } else { - return createR600MCCodeEmitter(MCII, MRI, STI, Ctx); + return createR600MCCodeEmitter(MCII, MRI, STI); } } diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h index 09d0d5b61c..abb032045b 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -33,8 +33,7 @@ extern Target TheAMDGPUTarget; MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, - MCContext &Ctx); + const MCSubtargetInfo &STI); MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index 271a974734..cb4cf0ce38 100644 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -36,13 +36,12 @@ class R600MCCodeEmitter : public AMDGPUMCCodeEmitter { const MCInstrInfo &MCII; const MCRegisterInfo &MRI; const MCSubtargetInfo &STI; - MCContext &Ctx; public: R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, - const MCSubtargetInfo &sti, MCContext &ctx) - : MCII(mcii), MRI(mri), STI(sti), Ctx(ctx) { } + const MCSubtargetInfo &sti) + : MCII(mcii), MRI(mri), STI(sti) { } /// \brief Encode the instruction and write it to the OS. 
virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS, @@ -99,9 +98,8 @@ enum TextureTypes { MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, - MCContext &Ctx) { - return new R600MCCodeEmitter(MCII, MRI, STI, Ctx); + const MCSubtargetInfo &STI) { + return new R600MCCodeEmitter(MCII, MRI, STI); } void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, @@ -181,6 +179,13 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, Emit((u_int32_t) 0, OS); } else { uint64_t Inst = getBinaryCodeForInstr(MI, Fixups); + if ((STI.getFeatureBits() & AMDGPU::FeatureR600ALUInst) && + ((Desc.TSFlags & R600_InstFlag::OP1) || + Desc.TSFlags & R600_InstFlag::OP2)) { + uint64_t ISAOpCode = Inst & (0x3FFULL << 39); + Inst &= ~(0x3FFULL << 39); + Inst |= ISAOpCode << 1; + } Emit(Inst, OS); } } diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td index 5ee1c0d8ae..0cbe919d81 100644 --- a/lib/Target/R600/Processors.td +++ b/lib/Target/R600/Processors.td @@ -45,3 +45,4 @@ def : Proc<"tahiti", SI_Itin, [Feature64BitPtr, FeatureFP64]>; def : Proc<"pitcairn", SI_Itin, [Feature64BitPtr, FeatureFP64]>; def : Proc<"verde", SI_Itin, [Feature64BitPtr, FeatureFP64]>; def : Proc<"oland", SI_Itin, [Feature64BitPtr, FeatureFP64]>; +def : Proc<"hainan", SI_Itin, [Feature64BitPtr, FeatureFP64]>; diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp index cdda3dab8d..ffe3414413 100644 --- a/lib/Target/R600/R600ControlFlowFinalizer.cpp +++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp @@ -325,7 +325,7 @@ public: virtual bool runOnMachineFunction(MachineFunction &MF) { unsigned MaxStack = 0; unsigned CurrentStack = 0; - bool hasPush; + bool HasPush = false; for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME; ++MB) { MachineBasicBlock &MBB = *MB; @@ -337,6 +337,7 @@ public: BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), getHWInstrDesc(CF_CALL_FS)); CfCount++; + MaxStack = 1; } std::vector<ClauseFile> FetchClauses, AluClauses; for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); @@ -354,7 +355,7 @@ public: case AMDGPU::CF_ALU_PUSH_BEFORE: CurrentStack++; MaxStack = std::max(MaxStack, CurrentStack); - hasPush = true; + HasPush = true; case AMDGPU::CF_ALU: I = MI; AluClauses.push_back(MakeALUClause(MBB, I)); @@ -475,7 +476,7 @@ public: break; } } - MFI->StackSize = getHWStackSize(MaxStack, hasPush); + MFI->StackSize = getHWStackSize(MaxStack, HasPush); } return false; diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index c6e2136ff4..7252235d5b 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -43,6 +43,8 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::AND, MVT::v4i32, Expand); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand); + setOperationAction(ISD::MUL, MVT::v2i32, Expand); + setOperationAction(ISD::MUL, MVT::v4i32, Expand); setOperationAction(ISD::OR, MVT::v4i32, Expand); setOperationAction(ISD::OR, MVT::v2i32, Expand); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand); @@ -50,6 +52,10 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::SHL, MVT::v2i32, Expand); setOperationAction(ISD::SRL, MVT::v4i32, Expand); setOperationAction(ISD::SRL, MVT::v2i32, 
Expand); + setOperationAction(ISD::SRA, MVT::v4i32, Expand); + setOperationAction(ISD::SRA, MVT::v2i32, Expand); + setOperationAction(ISD::SUB, MVT::v4i32, Expand); + setOperationAction(ISD::SUB, MVT::v2i32, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand); setOperationAction(ISD::UDIV, MVT::v4i32, Expand); setOperationAction(ISD::UREM, MVT::v4i32, Expand); @@ -78,6 +84,9 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::SELECT, MVT::i32, Custom); setOperationAction(ISD::SELECT, MVT::f32, Custom); + setOperationAction(ISD::VSELECT, MVT::v4i32, Expand); + setOperationAction(ISD::VSELECT, MVT::v2i32, Expand); + // Legalize loads and stores to the private address space. setOperationAction(ISD::LOAD, MVT::i32, Custom); setOperationAction(ISD::LOAD, MVT::v2i32, Custom); diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 8a60add450..8f47523524 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -1615,6 +1615,7 @@ let Predicates = [isEGorCayman] in { i32:$src2))], VecALU >; + def : BFEPattern <BFE_UINT_eg>; def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", [], VecALU>; defm : BFIPatterns <BFI_INT_eg>; diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 16e1e42c50..42b4e73509 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -369,7 +369,14 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF, /// getCompactUnwindRegNum - Get the compact unwind number for a given /// register. The number corresponds to the enum lists in /// compact_unwind_encoding.h. -static int getCompactUnwindRegNum(const uint16_t *CURegs, unsigned Reg) { +static int getCompactUnwindRegNum(unsigned Reg, bool is64Bit) { + static const uint16_t CU32BitRegs[] = { + X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 + }; + static const uint16_t CU64BitRegs[] = { + X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 + }; + const uint16_t *CURegs = is64Bit ? CU64BitRegs : CU32BitRegs; for (int Idx = 1; *CURegs; ++CURegs, ++Idx) if (*CURegs == Reg) return Idx; @@ -398,16 +405,8 @@ encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS], // 4 3 // 5 3 // - static const uint16_t CU32BitRegs[] = { - X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 - }; - static const uint16_t CU64BitRegs[] = { - X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 - }; - const uint16_t *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs); - for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) { - int CUReg = getCompactUnwindRegNum(CURegs, SavedRegs[i]); + int CUReg = getCompactUnwindRegNum(SavedRegs[i], Is64Bit); if (CUReg == -1) return ~0U; SavedRegs[i] = CUReg; } @@ -466,14 +465,6 @@ encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS], static uint32_t encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS], bool Is64Bit) { - static const uint16_t CU32BitRegs[] = { - X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 - }; - static const uint16_t CU64BitRegs[] = { - X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 - }; - const uint16_t *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs); - // Encode the registers in the order they were saved, 3-bits per register. The // registers are numbered from 1 to CU_NUM_SAVED_REGS. 
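The helper being refactored in this hunk feeds an encoding that packs up to six saved registers, three bits apiece, into one 32-bit word, with 0 marking an empty slot. A rough standalone C++ sketch of that packing, where CU_NUM_SAVED_REGS and the 1-based register numbers are simplified stand-ins for the real tables, and the bit order is reduced to the frame-based case:

#include <cassert>
#include <cstdint>

static const unsigned CU_NUM_SAVED_REGS = 6;

// Pack 1-based compact-unwind register numbers (0 = empty slot) in save
// order, three bits per register.
static uint32_t encodeSavedRegs(const unsigned Regs[CU_NUM_SAVED_REGS]) {
  uint32_t RegEnc = 0;
  unsigned Idx = 0;
  for (unsigned I = 0; I != CU_NUM_SAVED_REGS; ++I) {
    if (Regs[I] == 0)
      continue; // skip unused slots
    RegEnc |= (Regs[I] & 0x7) << (Idx++ * 3);
  }
  return RegEnc;
}

int main() {
  // Say EBX (1), then EDI (4), then EBP (6) were pushed.
  const unsigned Saved[CU_NUM_SAVED_REGS] = {1, 4, 6, 0, 0, 0};
  assert(encodeSavedRegs(Saved) == ((6u << 6) | (4u << 3) | 1u));
  return 0;
}
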
uint32_t RegEnc = 0; @@ -481,7 +472,7 @@ encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS], unsigned Reg = SavedRegs[I]; if (Reg == 0) continue; - int CURegNum = getCompactUnwindRegNum(CURegs, Reg); + int CURegNum = getCompactUnwindRegNum(Reg, Is64Bit); if (CURegNum == -1) return ~0U; // Encode the 3-bit register number in order, skipping over 3-bits for each @@ -534,6 +525,12 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { // If there are too many saved registers, we cannot use compact encoding. if (SavedRegIdx >= CU_NUM_SAVED_REGS) return CU::UNWIND_MODE_DWARF; + unsigned Reg = MI.getOperand(0).getReg(); + if (Reg == (Is64Bit ? X86::RAX : X86::EAX)) { + ExpectEnd = true; + continue; + } + SavedRegs[SavedRegIdx++] = MI.getOperand(0).getReg(); StackAdjust += OffsetSize; InstrOffset += PushInstrSize; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 45fe69aa29..f69f5d85f7 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -9336,29 +9336,31 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, if (Swap) std::swap(Op0, Op1); - // Since SSE has no unsigned integer comparisons, we need to flip the sign - // bits of the inputs before performing those operations. - if (FlipSigns) { - EVT EltVT = VT.getVectorElementType(); - SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), - EltVT); - std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); - SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], - SignBits.size()); - Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); - Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); - } - // Check that the operation in question is available (most are plain SSE2, // but PCMPGTQ and PCMPEQQ have different requirements). if (VT == MVT::v2i64) { if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) { assert(Subtarget->hasSSE2() && "Don't know how to lower!"); - // First cast everything to the right type, + // First cast everything to the right type. Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0); Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1); + // Since SSE has no unsigned integer comparisons, we need to flip the sign + // bits of the inputs before performing those operations. The lower + // compare is always unsigned. + SDValue SB; + if (FlipSigns) { + SB = DAG.getConstant(0x80000000U, MVT::v4i32); + } else { + SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32); + SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32); + SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + Sign, Zero, Sign, Zero); + } + Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB); + Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB); + // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)) SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1); @@ -9384,7 +9386,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, // pcmpeqd + pshufd + pand. assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!"); - // First cast everything to the right type, + // First cast everything to the right type. 
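Two identities carry the rewritten v2i64 path above: XORing the sign bit turns an unsigned compare into a signed one, and a signed 64-bit greater-than can be assembled from 32-bit halves as (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)), with the low halves compared unsigned. A scalar C++ sanity check of both, no SSE involved and purely illustrative:

#include <cassert>
#include <cstdint>

// Unsigned a > b expressed as a signed compare after flipping sign bits.
static bool ugt_via_sgt(uint32_t a, uint32_t b) {
  int32_t sa = (int32_t)(a ^ 0x80000000u);
  int32_t sb = (int32_t)(b ^ 0x80000000u);
  return sa > sb;
}

// Signed 64-bit a > b rebuilt from its 32-bit halves, mirroring the
// PCMPGTQ emulation: signed compare on the high halves, unsigned on the
// low halves (which is why the lower compare always flips signs).
static bool sgt64_from_halves(int64_t a, int64_t b) {
  int32_t hi1 = (int32_t)(a >> 32), hi2 = (int32_t)(b >> 32);
  uint32_t lo1 = (uint32_t)a, lo2 = (uint32_t)b;
  return (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2));
}

int main() {
  assert(ugt_via_sgt(0xFFFFFFFFu, 1u));
  assert(!ugt_via_sgt(1u, 0xFFFFFFFFu));
  assert(sgt64_from_halves(-1, -2));
  assert(!sgt64_from_halves(1LL << 33, 1LL << 34));
  return 0;
}
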
Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0); Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1); @@ -9403,6 +9405,15 @@ } } + // Since SSE has no unsigned integer comparisons, we need to flip the sign + // bits of the inputs before performing those operations. + if (FlipSigns) { + EVT EltVT = VT.getVectorElementType(); + SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT); + Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB); + Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB); + } + SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); // If the logical-not of the result is required, perform that now. diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index ad4a6c7c41..08d372512d 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -80,6 +80,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/PatternMatch.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/ValueHandle.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -382,7 +383,7 @@ public: // The starting value of the reduction. // It does not have to be zero! - Value *StartValue; + TrackingVH<Value> StartValue; // The instruction whose value is used outside the loop. Instruction *LoopExitInstr; // The kind of the reduction. @@ -427,7 +428,7 @@ public: /// This flag indicates if we need to add the runtime check. bool Need; /// Holds the pointers that we need to check. - SmallVector<Value*, 2> Pointers; + SmallVector<TrackingVH<Value>, 2> Pointers; /// Holds the pointer value at the beginning of the loop. SmallVector<const SCEV*, 2> Starts; /// Holds the pointer value at the end of the loop. @@ -441,7 +442,7 @@ public: InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {} InductionInfo() : StartValue(0), IK(IK_NoInduction) {} /// Start value. - Value *StartValue; + TrackingVH<Value> StartValue; /// Induction kind. InductionKind IK; }; @@ -2307,7 +2308,10 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { } bool LoopVectorizationLegality::canVectorize() { - assert(TheLoop->getLoopPreheader() && "No preheader!!"); + // We must have a loop in canonical form. Loops with indirectbr in them cannot + // be canonicalized. + if (!TheLoop->getLoopPreheader()) + return false; // We can only vectorize innermost loops. if (TheLoop->getSubLoopsVector().size()) @@ -2374,6 +2378,26 @@ bool LoopVectorizationLegality::canVectorize() { return true; } +/// \brief Check that the instruction has outside loop users and is not an +/// identified reduction variable. +static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, + SmallPtrSet<Value *, 4> &Reductions) { + // Reduction instructions are allowed to have exit users. All other + // instructions must not have external users. + if (!Reductions.count(Inst)) + // Check that all of the users of the instruction are inside the loop. + for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end(); + I != E; ++I) { + Instruction *U = cast<Instruction>(*I); + // This user may be a reduction exit value. 
+ if (!TheLoop->contains(U)) { + DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); + return true; + } + } + return false; +} + bool LoopVectorizationLegality::canVectorizeInstrs() { BasicBlock *PreHeader = TheLoop->getLoopPreheader(); BasicBlock *Header = TheLoop->getHeader(); @@ -2412,8 +2436,13 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // If this PHINode is not in the header block, then we know that we // can convert it to select during if-conversion. No need to check if // the PHIs in this block are induction or reduction variables. - if (*bb != Header) - continue; + if (*bb != Header) { + // Check that this instruction has no outside users or is an + // identified reduction value with an outside user. + if (!hasOutsideLoopUser(TheLoop, it, AllowedExit)) + continue; + return false; + } // We only allow if-converted PHIs with exactly two incoming values. if (Phi->getNumIncomingValues() != 2) { @@ -2506,17 +2535,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Reduction instructions are allowed to have exit users. // All other instructions must not have external users. - if (!AllowedExit.count(it)) - //Check that all of the users of the loop are inside the BB. - for (Value::use_iterator I = it->use_begin(), E = it->use_end(); - I != E; ++I) { - Instruction *U = cast<Instruction>(*I); - // This user may be a reduction exit value. - if (!TheLoop->contains(U)) { - DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); - return false; - } - } + if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) + return false; + } // next instr. } diff --git a/test/CodeGen/ARM/vmul.ll b/test/CodeGen/ARM/vmul.ll index 74628f0c5c..eb5ad8f0c3 100644 --- a/test/CodeGen/ARM/vmul.ll +++ b/test/CodeGen/ARM/vmul.ll @@ -599,3 +599,27 @@ for.end179: ; preds = %for.cond.loopexit, declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone declare <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone declare <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16>) nounwind readnone + +; vmull lowering would create a zext(v4i8 load()) instead of a zextload(v4i8), +; creating an illegal type during legalization and causing an assert. 
+; PR15970 +define void @no_illegal_types_vmull_sext(<4 x i32> %a) { +entry: + %wide.load283.i = load <4 x i8>* undef, align 1 + %0 = sext <4 x i8> %wide.load283.i to <4 x i32> + %1 = sub nsw <4 x i32> %0, %a + %2 = mul nsw <4 x i32> %1, %1 + %predphi290.v.i = select <4 x i1> undef, <4 x i32> undef, <4 x i32> %2 + store <4 x i32> %predphi290.v.i, <4 x i32>* undef, align 4 + ret void +} +define void @no_illegal_types_vmull_zext(<4 x i32> %a) { +entry: + %wide.load283.i = load <4 x i8>* undef, align 1 + %0 = zext <4 x i8> %wide.load283.i to <4 x i32> + %1 = sub nsw <4 x i32> %0, %a + %2 = mul nsw <4 x i32> %1, %1 + %predphi290.v.i = select <4 x i1> undef, <4 x i32> undef, <4 x i32> %2 + store <4 x i32> %predphi290.v.i, <4 x i32>* undef, align 4 + ret void +} diff --git a/test/CodeGen/Generic/annotate.ll b/test/CodeGen/Generic/annotate.ll new file mode 100644 index 0000000000..c617eb0925 --- /dev/null +++ b/test/CodeGen/Generic/annotate.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s + +; PR15253 + +@.str = private unnamed_addr constant [4 x i8] c"sth\00", section "llvm.metadata" +@.str1 = private unnamed_addr constant [4 x i8] c"t.c\00", section "llvm.metadata" + + +define i32 @foo(i32 %a) { +entry: + %0 = call i32 @llvm.annotation.i32(i32 %a, i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([4 x i8]* @.str1, i32 0, i32 0), i32 2) + ret i32 %0 +} + +declare i32 @llvm.annotation.i32(i32, i8*, i8*, i32) #1 diff --git a/test/CodeGen/Generic/ptr-annotate.ll b/test/CodeGen/Generic/ptr-annotate.ll new file mode 100644 index 0000000000..ac5bd5533e --- /dev/null +++ b/test/CodeGen/Generic/ptr-annotate.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s + +; PR15253 + +%struct.mystruct = type { i32 } + +@.str = private unnamed_addr constant [4 x i8] c"sth\00", section "llvm.metadata" +@.str1 = private unnamed_addr constant [4 x i8] c"t.c\00", section "llvm.metadata" + +define void @foo() { +entry: + %m = alloca i8, align 4 + %0 = call i8* @llvm.ptr.annotation.p0i8(i8* %m, i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([4 x i8]* @.str1, i32 0, i32 0), i32 2) + store i8 1, i8* %0, align 4 + ret void +} + +declare i8* @llvm.ptr.annotation.p0i8(i8*, i8*, i8*, i32) #1 diff --git a/test/CodeGen/NVPTX/generic-to-nvvm.ll b/test/CodeGen/NVPTX/generic-to-nvvm.ll new file mode 100644 index 0000000000..c9cb2f71f4 --- /dev/null +++ b/test/CodeGen/NVPTX/generic-to-nvvm.ll @@ -0,0 +1,25 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -drvcuda | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + +; Ensure global variables in address space 0 are promoted to address space 1 + +; CHECK: .global .align 4 .u32 myglobal = 42; +@myglobal = internal global i32 42, align 4 +; CHECK: .global .align 4 .u32 myconst = 42; +@myconst = internal constant i32 42, align 4 + + +define void @foo(i32* %a, i32* %b) { +; CHECK: cvta.global.u32 + %ld1 = load i32* @myglobal +; CHECK: cvta.global.u32 + %ld2 = load i32* @myconst + store i32 %ld1, i32* %a + store i32 %ld2, i32* %b + ret void +} + + +!nvvm.annotations = !{!0} +!0 = metadata !{void (i32*, i32*)* @foo, metadata !"kernel", i32 1} diff --git a/test/CodeGen/NVPTX/i1-global.ll b/test/CodeGen/NVPTX/i1-global.ll new file mode 100644 index 0000000000..0595325977 --- /dev/null +++ b/test/CodeGen/NVPTX/i1-global.ll @@ -0,0 +1,19 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -drvcuda | FileCheck %s + +target datalayout = 
"e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + + +; CHECK: .visible .global .align 1 .u8 mypred +@mypred = addrspace(1) global i1 true, align 1 + + +define void @foo(i1 %p, i32* %out) { + %ld = load i1 addrspace(1)* @mypred + %val = zext i1 %ld to i32 + store i32 %val, i32* %out + ret void +} + + +!nvvm.annotations = !{!0} +!0 = metadata !{void (i1, i32*)* @foo, metadata !"kernel", i32 1} diff --git a/test/CodeGen/NVPTX/i1-param.ll b/test/CodeGen/NVPTX/i1-param.ll new file mode 100644 index 0000000000..fabd61a25d --- /dev/null +++ b/test/CodeGen/NVPTX/i1-param.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -drvcuda | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + +; Make sure predicate (i1) operands to kernels get expanded out to .u8 + +; CHECK: .entry foo +; CHECK: .param .u8 foo_param_0 +; CHECK: .param .u32 foo_param_1 +define void @foo(i1 %p, i32* %out) { + %val = zext i1 %p to i32 + store i32 %val, i32* %out + ret void +} + + +!nvvm.annotations = !{!0} +!0 = metadata !{void (i1, i32*)* @foo, metadata !"kernel", i32 1} diff --git a/test/CodeGen/NVPTX/intrinsics.ll b/test/CodeGen/NVPTX/intrinsics.ll index 8b0357be87..1676f20643 100644 --- a/test/CodeGen/NVPTX/intrinsics.ll +++ b/test/CodeGen/NVPTX/intrinsics.ll @@ -15,5 +15,12 @@ define ptx_device double @test_fabs(double %d) { ret double %x } +define float @test_nvvm_sqrt(float %a) { + %val = call float @llvm.nvvm.sqrt.f(float %a) + ret float %val +} + + declare float @llvm.fabs.f32(float) declare double @llvm.fabs.f64(double) +declare float @llvm.nvvm.sqrt.f(float) diff --git a/test/CodeGen/NVPTX/refl1.ll b/test/CodeGen/NVPTX/refl1.ll new file mode 100644 index 0000000000..5a9dac152e --- /dev/null +++ b/test/CodeGen/NVPTX/refl1.ll @@ -0,0 +1,37 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -drvcuda | FileCheck %s + +; Function Attrs: nounwind +; CHECK: .entry foo +define void @foo(float* nocapture %a) #0 { + %val = load float* %a + %tan = tail call fastcc float @__nv_fast_tanf(float %val) + store float %tan, float* %a + ret void +} + +; Function Attrs: nounwind readnone +declare float @llvm.nvvm.sin.approx.ftz.f(float) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.nvvm.cos.approx.ftz.f(float) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.nvvm.div.approx.ftz.f(float, float) #1 + +; Function Attrs: alwaysinline inlinehint nounwind readnone +; CHECK: .func (.param .b32 func_retval0) __nv_fast_tanf +define internal fastcc float @__nv_fast_tanf(float %a) #2 { +entry: + %0 = tail call float @llvm.nvvm.sin.approx.ftz.f(float %a) + %1 = tail call float @llvm.nvvm.cos.approx.ftz.f(float %a) + %2 = tail call float @llvm.nvvm.div.approx.ftz.f(float %0, float %1) + ret float %2 +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { alwaysinline inlinehint nounwind readnone } + +!nvvm.annotations = !{!0} + +!0 = metadata !{void (float*)* @foo, metadata !"kernel", i32 1} diff --git a/test/CodeGen/R600/bfe_uint.ll b/test/CodeGen/R600/bfe_uint.ll new file mode 100644 index 0000000000..92570c3152 --- /dev/null +++ b/test/CodeGen/R600/bfe_uint.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: @bfe_def +; CHECK: BFE_UINT +define void @bfe_def(i32 addrspace(1)* %out, i32 %x) { +entry: + %0 = lshr i32 %x, 5 + %1 = and i32 %0, 
15 ; 0xf + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; This program could be implemented using a BFE_UINT instruction, however +; since the lshr constant + number of bits in the mask is >= 32, it can also be +; implemented with a LSHR instruction, which is better, because LSHR has fewer +; operands and requires fewer constants. + +; CHECK: @bfe_shift +; CHECK-NOT: BFE_UINT +define void @bfe_shift(i32 addrspace(1)* %out, i32 %x) { +entry: + %0 = lshr i32 %x, 16 + %1 = and i32 %0, 65535 ; 0xffff + store i32 %1, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/mul.ll b/test/CodeGen/R600/mul.ll new file mode 100644 index 0000000000..7278e90398 --- /dev/null +++ b/test/CodeGen/R600/mul.ll @@ -0,0 +1,16 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; mul24 and mad24 are affected +;CHECK: MULLO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: MULLO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: MULLO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: MULLO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32> addrspace(1) * %in + %b = load <4 x i32> addrspace(1) * %b_ptr + %result = mul <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/r600-encoding.ll b/test/CodeGen/R600/r600-encoding.ll new file mode 100644 index 0000000000..c8040a1b4c --- /dev/null +++ b/test/CodeGen/R600/r600-encoding.ll @@ -0,0 +1,24 @@ +; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s +; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=rs880 | FileCheck --check-prefix=R600-CHECK %s + +; The earliest R600 GPUs have a slightly different encoding than the rest of +; the VLIW4/5 GPUs. 
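The pattern pinned down by @bfe_def and @bfe_shift above is (x >> shift) & mask with mask a run of low ones: a bit-field extract of width bits at position shift. The BFE machinery only pays off while shift + width < 32; at 32 or more, the shift alone already clears the high bits. A small C++ illustration, where bfe_u32 is a reference model rather than the hardware instruction:

#include <cassert>
#include <cstdint>

// Reference model of an unsigned bit-field extract: width bits at shift.
static uint32_t bfe_u32(uint32_t x, uint32_t shift, uint32_t width) {
  uint32_t mask = width < 32 ? (1u << width) - 1u : 0xFFFFFFFFu;
  return (x >> shift) & mask;
}

int main() {
  uint32_t x = 0xDEADBEEFu;
  // @bfe_def: (x >> 5) & 0xf is a genuine 4-bit extract at bit 5.
  assert(((x >> 5) & 0xfu) == bfe_u32(x, 5, 4));
  // @bfe_shift: shift + width == 32, the AND is redundant, LSHR suffices.
  assert(((x >> 16) & 0xffffu) == (x >> 16));
  return 0;
}
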
+ +; EG-CHECK: @test +; EG-CHECK: MUL_IEEE {{[ *TXYZW.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x01,0x[0-9a-f]+,0x[0-9a-f]+}}] + +; R600-CHECK: @test +; R600-CHECK: MUL_IEEE {{[ *TXYZW.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x02,0x[0-9a-f]+,0x[0-9a-f]+}}] + +define void @test() { +entry: + %0 = call float @llvm.R600.load.input(i32 0) + %1 = call float @llvm.R600.load.input(i32 1) + %2 = fmul float %0, %1 + call void @llvm.AMDGPU.store.output(float %2, i32 0) + ret void +} + +declare float @llvm.R600.load.input(i32) readnone + +declare void @llvm.AMDGPU.store.output(float, i32) diff --git a/test/CodeGen/R600/sra.ll b/test/CodeGen/R600/sra.ll new file mode 100644 index 0000000000..972542d346 --- /dev/null +++ b/test/CodeGen/R600/sra.ll @@ -0,0 +1,13 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: @ashr_v4i32 +; CHECK: ASHR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: ASHR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: ASHR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: ASHR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) { + %result = ashr <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/sub.ll b/test/CodeGen/R600/sub.ll new file mode 100644 index 0000000000..12bfba3975 --- /dev/null +++ b/test/CodeGen/R600/sub.ll @@ -0,0 +1,15 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: SUB_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32> addrspace(1) * %in + %b = load <4 x i32> addrspace(1) * %b_ptr + %result = sub <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/vselect.ll b/test/CodeGen/R600/vselect.ll new file mode 100644 index 0000000000..6e459df847 --- /dev/null +++ b/test/CodeGen/R600/vselect.ll @@ -0,0 +1,17 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: @test_select_v4i32 +; CHECK: CNDE_INT T{{[0-9]+\.[XYZW], PV\.[xyzw], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: CNDE_INT * T{{[0-9]+\.[XYZW], PV\.[xyzw], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: CNDE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: CNDE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) { +entry: + %0 = load <4 x i32> addrspace(1)* %in0 + %1 = load <4 x i32> addrspace(1)* %in1 + %cmp = icmp ne <4 x i32> %0, %1 + %result = select <4 x i1> %cmp, <4 x i32> %0, <4 x i32> %1 + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/X86/compact-unwind.ll b/test/CodeGen/X86/compact-unwind.ll new file mode 100644 index 0000000000..8c4fa27da5 --- /dev/null +++ b/test/CodeGen/X86/compact-unwind.ll @@ -0,0 +1,30 @@ +; RUN: llc < %s -disable-cfi -disable-fp-elim -mtriple x86_64-apple-darwin11 | 
FileCheck %s + +%ty = type { i8* } + +@gv = external global i32 + +; This is aligning the stack with a push of a random register. +; CHECK: pushq %rax + +; Even though we can't encode %rax into the compact unwind, we still want to be +; able to generate a compact unwind encoding in this particular case. +; +; CHECK: __LD,__compact_unwind +; CHECK: _foo ## Range Start +; CHECK: 16842753 ## Compact Unwind Encoding: 0x1010001 + +define i8* @foo(i64 %size) { + %addr = alloca i64, align 8 + %tmp20 = load i32* @gv, align 4 + %tmp21 = call i32 @bar() + %tmp25 = load i64* %addr, align 8 + %tmp26 = inttoptr i64 %tmp25 to %ty* + %tmp29 = getelementptr inbounds %ty* %tmp26, i64 0, i32 0 + %tmp34 = load i8** %tmp29, align 8 + %tmp35 = getelementptr inbounds i8* %tmp34, i64 %size + store i8* %tmp35, i8** %tmp29, align 8 + ret i8* null +} + +declare i32 @bar() diff --git a/test/CodeGen/X86/vec_compare.ll b/test/CodeGen/X86/vec_compare.ll index 85d8b2cea3..fd5c234bb1 100644 --- a/test/CodeGen/X86/vec_compare.ll +++ b/test/CodeGen/X86/vec_compare.ll @@ -67,7 +67,15 @@ define <2 x i64> @test6(<2 x i64> %A, <2 x i64> %B) nounwind { } define <2 x i64> @test7(<2 x i64> %A, <2 x i64> %B) nounwind { +; CHECK: [[CONSTSEG:[A-Z0-9_]*]]: +; CHECK: .long 2147483648 +; CHECK-NEXT: .long 0 +; CHECK-NEXT: .long 2147483648 +; CHECK-NEXT: .long 0 ; CHECK: test7: +; CHECK: movdqa [[CONSTSEG]], [[CONSTREG:%xmm[0-9]*]] +; CHECK: pxor [[CONSTREG]] +; CHECK: pxor [[CONSTREG]] ; CHECK: pcmpgtd %xmm1 ; CHECK: pshufd $-96 ; CHECK: pcmpeqd @@ -83,6 +91,8 @@ define <2 x i64> @test8(<2 x i64> %A, <2 x i64> %B) nounwind { ; CHECK: test8: +; CHECK: pxor +; CHECK: pxor ; CHECK: pcmpgtd %xmm0 ; CHECK: pshufd $-96 ; CHECK: pcmpeqd @@ -98,6 +108,8 @@ define <2 x i64> @test9(<2 x i64> %A, <2 x i64> %B) nounwind { ; CHECK: test9: +; CHECK: pxor +; CHECK: pxor ; CHECK: pcmpgtd %xmm0 ; CHECK: pshufd $-96 ; CHECK: pcmpeqd @@ -115,6 +127,8 @@ define <2 x i64> @test10(<2 x i64> %A, <2 x i64> %B) nounwind { ; CHECK: test10: +; CHECK: pxor +; CHECK: pxor ; CHECK: pcmpgtd %xmm1 ; CHECK: pshufd $-96 ; CHECK: pcmpeqd @@ -131,9 +145,15 @@ define <2 x i64> @test10(<2 x i64> %A, <2 x i64> %B) nounwind { } define <2 x i64> @test11(<2 x i64> %A, <2 x i64> %B) nounwind { +; CHECK: [[CONSTSEG:[A-Z0-9_]*]]: +; CHECK: .long 2147483648 +; CHECK-NEXT: .long 2147483648 +; CHECK-NEXT: .long 2147483648 +; CHECK-NEXT: .long 2147483648 ; CHECK: test11: -; CHECK: pxor -; CHECK: pxor +; CHECK: movdqa [[CONSTSEG]], [[CONSTREG:%xmm[0-9]*]] +; CHECK: pxor [[CONSTREG]] +; CHECK: pxor [[CONSTREG]] ; CHECK: pcmpgtd %xmm1 ; CHECK: pshufd $-96 ; CHECK: pcmpeqd diff --git a/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll b/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll index e7af892c10..620478a879 100644 --- a/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll +++ b/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll @@ -1,5 +1,6 @@ ; RUN: llc -O0 %s -mtriple=x86_64-apple-darwin -filetype=obj -o %t ; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llc < %s -O0 -mtriple=x86_64-apple-macosx10.7 | FileCheck %s -check-prefix=ASM ; rdar://13067005 ; CHECK: .debug_info contents: @@ -20,6 +21,11 @@ ; CHECK: file_names[ 1] 0 0x00000000 0x00000000 simple2.c ; CHECK-NOT: file_names +; PR15408 +; ASM: L__DWARF__debug_info_begin0: +; ASM: .long 0 ## DW_AT_stmt_list +; ASM: 
L__DWARF__debug_info_begin1: +; ASM: .long 0 ## DW_AT_stmt_list define i32 @test(i32 %a) nounwind uwtable ssp { entry: %a.addr = alloca i32, align 4 diff --git a/test/ExecutionEngine/MCJIT/eh.ll b/test/ExecutionEngine/MCJIT/eh.ll index 0c19b1bf2e..c2135736ad 100644 --- a/test/ExecutionEngine/MCJIT/eh.ll +++ b/test/ExecutionEngine/MCJIT/eh.ll @@ -1,5 +1,5 @@ ; RUN: %lli_mcjit %s -; XFAIL: arm, cygwin +; XFAIL: arm, cygwin, win32, mingw declare i8* @__cxa_allocate_exception(i64) declare void @__cxa_throw(i8*, i8*, i8*) declare i32 @__gxx_personality_v0(...) diff --git a/test/MC/AsmParser/exprs.s b/test/MC/AsmParser/exprs.s index df075f85ec..a7e10020b6 100644 --- a/test/MC/AsmParser/exprs.s +++ b/test/MC/AsmParser/exprs.s @@ -45,6 +45,7 @@ k: check_expr 0 || 0, 0 check_expr 1 + 2 < 3 + 4, 1 check_expr 1 << 8 - 1, 128 + check_expr 3 * 9 - 2 * 9 + 1, 10 .set c, 10 check_expr c + 1, 11 diff --git a/test/Transforms/LoopUnroll/scevunroll.ll b/test/Transforms/LoopUnroll/scevunroll.ll index 99b3a7d861..308a036316 100644 --- a/test/Transforms/LoopUnroll/scevunroll.ll +++ b/test/Transforms/LoopUnroll/scevunroll.ll @@ -66,13 +66,16 @@ exit2: ; SCEV properly unrolls multi-exit loops. ; +; SCEV cannot currently unroll this loop. +; It should ideally detect a trip count of 5. +; rdar:14038809 [SCEV]: Optimize trip count computation for multi-exit loops. ; CHECK: @multiExit -; CHECK: getelementptr i32* %base, i32 10 -; CHECK-NEXT: load i32* -; CHECK: br i1 false, label %l2.10, label %exit1 -; CHECK: l2.10: -; CHECK-NOT: br -; CHECK: ret i32 +; CHECKFIXME: getelementptr i32* %base, i32 10 +; CHECKFIXME-NEXT: load i32* +; CHECKFIXME: br i1 false, label %l2.10, label %exit1 +; CHECKFIXME: l2.10: +; CHECKFIXME-NOT: br +; CHECKFIXME: ret i32 define i32 @multiExit(i32* %base) nounwind { entry: br label %l1 @@ -170,3 +173,38 @@ for.body87: br label %for.body87 } +; PR16130: clang produces incorrect code with loop/expression at -O2 +; rdar:14036816 loop-unroll makes assumptions about undefined behavior +; +; The loop latch is assumed to exit after the first iteration because +; of the induction variable's NSW flag. However, the loop latch's +; equality test is skipped and the loop exits after the second +; iteration via the early exit. So loop unrolling cannot assume that +; the loop latch's exit count of zero is an upper bound on the number +; of iterations. 
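A C rendering of the @nsw_latch kernel below makes the trap easier to see; this is a hand back-translation of the IR, so treat it as illustrative only. b starts at 0, the early exit fires on the second trip once b has reached 8, and the latch's equality test against 13 is never taken, so the store must see 8:

#include <cstdio>

// Hand back-translation of @nsw_latch (illustrative, not the reported C).
static void nsw_latch(int *a) {
  int b = 0;
  for (;;) {
    if (b != 0)  // early exit: taken on the second iteration, when b == 8
      break;
    b += 8;      // the 'add nsw': never wraps, but that only bounds
    if (b == 13) // the latch exit, not the early one, which is why a
      break;     // zero latch-exit count must not drive unrolling
  }
  *a = b;        // must store 8
}

int main() {
  int a = 0;
  nsw_latch(&a);
  std::printf("%d\n", a); // prints 8
  return 0;
}
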
+; +; CHECK: @nsw_latch +; CHECK: for.body: +; CHECK: %b.03 = phi i32 [ 0, %entry ], [ %add, %for.cond ] +; CHECK: return: +; CHECK: %b.03.lcssa = phi i32 [ %b.03, %for.body ], [ %b.03, %for.cond ] +define void @nsw_latch(i32* %a) nounwind { +entry: + br label %for.body + +for.body: ; preds = %for.cond, %entry + %b.03 = phi i32 [ 0, %entry ], [ %add, %for.cond ] + %tobool = icmp eq i32 %b.03, 0 + %add = add nsw i32 %b.03, 8 + br i1 %tobool, label %for.cond, label %return + +for.cond: ; preds = %for.body + %cmp = icmp eq i32 %add, 13 + br i1 %cmp, label %return, label %for.body + +return: ; preds = %for.body, %for.cond + %b.03.lcssa = phi i32 [ %b.03, %for.body ], [ %b.03, %for.cond ] + %retval.0 = phi i32 [ 1, %for.body ], [ 0, %for.cond ] + store i32 %b.03.lcssa, i32* %a, align 4 + ret void +} diff --git a/test/Transforms/LoopUnroll/unloop.ll b/test/Transforms/LoopUnroll/unloop.ll index 5a9cacda44..9a938cc287 100644 --- a/test/Transforms/LoopUnroll/unloop.ll +++ b/test/Transforms/LoopUnroll/unloop.ll @@ -21,8 +21,8 @@ outer: inner: %iv = phi i32 [ 0, %outer ], [ %inc, %tail ] %inc = add i32 %iv, 1 - %wbucond = call zeroext i1 @check() - br i1 %wbucond, label %outer.backedge, label %tail + call zeroext i1 @check() + br i1 true, label %outer.backedge, label %tail tail: br i1 false, label %inner, label %exit @@ -126,25 +126,27 @@ return: ; Ensure that only the middle loop is removed and rely on verify-loopinfo to ; check soundness. ; -; CHECK: @unloopDeepNested +; This test must be disabled until trip count computation can be optimized... +; rdar:14038809 [SCEV]: Optimize trip count computation for multi-exit loops. +; CHECKFIXME: @unloopDeepNested ; Inner-inner loop control. -; CHECK: while.cond.us.i: -; CHECK: br i1 %cmp.us.i, label %next_data.exit, label %while.body.us.i -; CHECK: if.then.us.i: -; CHECK: br label %while.cond.us.i +; CHECKFIXME: while.cond.us.i: +; CHECKFIXME: br i1 %cmp.us.i, label %next_data.exit, label %while.body.us.i +; CHECKFIXME: if.then.us.i: +; CHECKFIXME: br label %while.cond.us.i ; Inner loop tail. -; CHECK: if.else.i: -; CHECK: br label %while.cond.outer.i +; CHECKFIXME: if.else.i: +; CHECKFIXME: br label %while.cond.outer.i ; Middle loop control (removed). -; CHECK: valid_data.exit: -; CHECK-NOT: br -; CHECK: %cmp = call zeroext i1 @check() +; CHECKFIXME: valid_data.exit: +; CHECKFIXME-NOT: br +; CHECKFIXME: %cmp = call zeroext i1 @check() ; Outer loop control. -; CHECK: copy_data.exit: -; CHECK: br i1 %cmp38, label %if.then39, label %while.cond.outer +; CHECKFIXME: copy_data.exit: +; CHECKFIXME: br i1 %cmp38, label %if.then39, label %while.cond.outer ; Outer-outer loop tail. 
-; CHECK: while.cond.outer.outer.backedge: -; CHECK: br label %while.cond.outer.outer +; CHECKFIXME: while.cond.outer.outer.backedge: +; CHECKFIXME: br label %while.cond.outer.outer define void @unloopDeepNested() nounwind { for.cond8.preheader.i: %cmp113.i = call zeroext i1 @check() diff --git a/test/Transforms/LoopVectorize/lcssa-crash.ll b/test/Transforms/LoopVectorize/lcssa-crash.ll index 06b3b08aa0..de6be54849 100644 --- a/test/Transforms/LoopVectorize/lcssa-crash.ll +++ b/test/Transforms/LoopVectorize/lcssa-crash.ll @@ -27,3 +27,14 @@ for.end.i.i.i: unreachable } +; PR16139 +define void @test2(i8* %x) { +entry: + indirectbr i8* %x, [ label %L0, label %L1 ] + +L0: + br label %L0 + +L1: + ret void +} diff --git a/test/Transforms/LoopVectorize/no_outside_user.ll b/test/Transforms/LoopVectorize/no_outside_user.ll new file mode 100644 index 0000000000..6f0357c5e5 --- /dev/null +++ b/test/Transforms/LoopVectorize/no_outside_user.ll @@ -0,0 +1,41 @@ +; RUN: opt -S -loop-vectorize -force-vector-unroll=1 -force-vector-width=2 < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128" + +@f = common global i32 0, align 4 +@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 +@c = common global i32 0, align 4 +@a = common global i32 0, align 4 +@b = common global i32 0, align 4 +@e = common global i32 0, align 4 + +; We used to vectorize this loop. But it has a value that is used outside of the +; loop and is not a recognized reduction variable "tmp17". + +; CHECK-NOT: <2 x i32> + +define i32 @main() { +bb: + %b.promoted = load i32* @b, align 4 + br label %.lr.ph.i + +.lr.ph.i: + %tmp8 = phi i32 [ %tmp18, %bb16 ], [ %b.promoted, %bb ] + %tmp2 = icmp sgt i32 %tmp8, 10 + br i1 %tmp2, label %bb16, label %bb10 + +bb10: + br label %bb16 + +bb16: + %tmp17 = phi i32 [ 0, %bb10 ], [ 1, %.lr.ph.i ] + %tmp18 = add nsw i32 %tmp8, 1 + %tmp19 = icmp slt i32 %tmp18, 4 + br i1 %tmp19, label %.lr.ph.i, label %f1.exit.loopexit + +f1.exit.loopexit: + %.lcssa = phi i32 [ %tmp17, %bb16 ] + ret i32 %.lcssa +} + + diff --git a/test/Transforms/LoopVectorize/value-ptr-bug.ll b/test/Transforms/LoopVectorize/value-ptr-bug.ll new file mode 100644 index 0000000000..f376656f07 --- /dev/null +++ b/test/Transforms/LoopVectorize/value-ptr-bug.ll @@ -0,0 +1,50 @@ +; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-unroll=1 -dce -instcombine < %s | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +; PR16073 + +; Because we were caching value pointers across a function call that could RAUW, +; we would generate an undefined value store below: +; SCEVExpander::expandCodeFor would change a value (the start value of an +; induction) that we cached in the induction variable list. 
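The hazard described here is generic: cache a raw Value* across a call that may RAUW it, and the cached pointer silently goes stale. A toy C++ model of the fix, a handle that a registry retargets on replacement, standing in for llvm::TrackingVH; the Context and TrackingHandle names are invented for this sketch and are not the LLVM API:

#include <cassert>
#include <string>
#include <vector>

struct Value { std::string Name; };

// Toy registry: tracking handles are retargeted when a value is replaced.
struct Context {
  std::vector<Value **> Handles;
  void replaceAllUsesWith(Value *From, Value *To) {
    for (Value **H : Handles)
      if (*H == From)
        *H = To; // a raw cached Value* would keep pointing at From
  }
};

// Stand-in for llvm::TrackingVH: registers its slot with the context.
struct TrackingHandle {
  Value *V;
  TrackingHandle(Context &C, Value *V) : V(V) { C.Handles.push_back(&this->V); }
};

int main() {
  Context Ctx;
  Value Start{"start"}, Expanded{"start.expanded"};
  TrackingHandle Cached(Ctx, &Start);        // e.g. an induction start value
  Ctx.replaceAllUsesWith(&Start, &Expanded); // SCEVExpander-style RAUW
  assert(Cached.V == &Expanded);             // the handle followed the RAUW
  return 0;
}
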
+ +; CHECK: test_vh +; CHECK-NOT: store <4 x i8> undef + +define void @test_vh(i32* %ptr265, i32* %ptr266, i32 %sub267) { +entry: + br label %loop + +loop: + %inc = phi i32 [ %sub267, %entry ], [ %add, %loop] + %ext.inc = sext i32 %inc to i64 + %add.ptr265 = getelementptr inbounds i32* %ptr265, i64 %ext.inc + %add.ptr266 = getelementptr inbounds i32* %ptr266, i64 %ext.inc + %add = add i32 %inc, 9 + %cmp = icmp slt i32 %add, 140 + br i1 %cmp, label %block1, label %loop + +block1: + %sub267.lcssa = phi i32 [ %add, %loop ] + %add.ptr266.lcssa = phi i32* [ %add.ptr266, %loop ] + %add.ptr265.lcssa = phi i32* [ %add.ptr265, %loop ] + %tmp29 = bitcast i32* %add.ptr265.lcssa to i8* + %tmp30 = bitcast i32* %add.ptr266.lcssa to i8* + br label %do.body272 + +do.body272: + %row_width.5 = phi i32 [ %sub267.lcssa, %block1 ], [ %dec, %do.body272 ] + %sp.4 = phi i8* [ %tmp30, %block1 ], [ %incdec.ptr273, %do.body272 ] + %dp.addr.4 = phi i8* [ %tmp29, %block1 ], [ %incdec.ptr274, %do.body272 ] + %incdec.ptr273 = getelementptr inbounds i8* %sp.4, i64 1 + %tmp31 = load i8* %sp.4, align 1 + %incdec.ptr274 = getelementptr inbounds i8* %dp.addr.4, i64 1 + store i8 %tmp31, i8* %dp.addr.4, align 1 + %dec = add i32 %row_width.5, -1 + %cmp276 = icmp eq i32 %dec, 0 + br i1 %cmp276, label %loop.exit, label %do.body272 + +loop.exit: + ret void +} |
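To close the loop on no_outside_user.ll above: in source terms, a per-iteration value that escapes the loop without being a recognized reduction forces scalar code. A C++ back-translation of @main from that test, illustrative only:

#include <cstdio>

// Back-translation of @main from no_outside_user.ll (illustrative).
// tmp17 is recomputed every iteration and read after the loop; it is not
// a reduction, so the loop vectorizer must leave this loop scalar.
static int f1(int b) {
  int tmp8 = b, tmp17 = 0;
  do {
    tmp17 = (tmp8 > 10) ? 1 : 0;
    ++tmp8;
  } while (tmp8 < 4);
  return tmp17; // the outside-loop user
}

int main() {
  std::printf("%d\n", f1(0)); // prints 0
  return 0;
}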