author     Abdoulaye Walsimou Gaye <awg@embtoolkit.org>    2013-06-12 21:27:20 +0200
committer  Abdoulaye Walsimou Gaye <awg@embtoolkit.org>    2013-06-12 21:27:20 +0200
commit     46ef71a74329a0777e3464c65927cd3e59c928d6 (patch)
tree       e8c06aeb70dcbe18acdf6db1f88a1452ac1dca68
parent     32c493313c30abe0830e95dc92c7ad1c2241ba57 (diff)
parent     ce337502f5a88500df9ab2f59ab48f97be0b4395 (diff)
download   llvm-46ef71a74329a0777e3464c65927cd3e59c928d6.tar.gz
           llvm-46ef71a74329a0777e3464c65927cd3e59c928d6.tar.bz2
           llvm-46ef71a74329a0777e3464c65927cd3e59c928d6.tar.xz
Merge branch 'release_33' of git://github.com/llvm-mirror/llvm into embtk-support-release-3.3
-rw-r--r--  autoconf/configure.ac | 2
-rwxr-xr-x  configure | 18
-rw-r--r--  docs/CommandLine.rst | 2
-rw-r--r--  docs/LangRef.rst | 9
-rw-r--r--  docs/ReleaseNotes.rst | 226
-rw-r--r--  include/llvm/Analysis/ScalarEvolution.h | 10
-rw-r--r--  include/llvm/IR/IntrinsicsNVVM.td | 2
-rw-r--r--  include/llvm/MC/MCELFObjectWriter.h | 9
-rw-r--r--  lib/Analysis/ScalarEvolution.cpp | 105
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 17
-rw-r--r--  lib/CodeGen/IntrinsicLowering.cpp | 6
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 5
-rw-r--r--  lib/MC/MCParser/AsmParser.cpp | 2
-rw-r--r--  lib/MC/MCStreamer.cpp | 1
-rw-r--r--  lib/Target/ARM/ARMISelLowering.cpp | 57
-rw-r--r--  lib/Target/NVPTX/CMakeLists.txt | 1
-rw-r--r--  lib/Target/NVPTX/NVPTX.h | 4
-rw-r--r--  lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 133
-rw-r--r--  lib/Target/NVPTX/NVPTXAsmPrinter.h | 26
-rw-r--r--  lib/Target/NVPTX/NVPTXGenericToNVVM.cpp | 436
-rw-r--r--  lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 7
-rw-r--r--  lib/Target/NVPTX/NVPTXISelDAGToDAG.h | 4
-rw-r--r--  lib/Target/NVPTX/NVPTXInstrInfo.td | 3
-rw-r--r--  lib/Target/NVPTX/NVPTXIntrinsics.td | 40
-rw-r--r--  lib/Target/NVPTX/NVPTXTargetMachine.cpp | 8
-rw-r--r--  lib/Target/NVPTX/NVVMReflect.cpp | 24
-rw-r--r--  lib/Target/PowerPC/PPCBranchSelector.cpp | 12
-rw-r--r--  lib/Target/R600/AMDGPUInstructions.td | 11
-rw-r--r--  lib/Target/R600/AMDGPUTargetMachine.cpp | 1
-rw-r--r--  lib/Target/R600/AMDILDeviceInfo.cpp | 3
-rw-r--r--  lib/Target/R600/AMDILPeepholeOptimizer.cpp | 1215
-rw-r--r--  lib/Target/R600/CMakeLists.txt | 1
-rw-r--r--  lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp | 2
-rw-r--r--  lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h | 3
-rw-r--r--  lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 17
-rw-r--r--  lib/Target/R600/Processors.td | 1
-rw-r--r--  lib/Target/R600/R600ControlFlowFinalizer.cpp | 7
-rw-r--r--  lib/Target/R600/R600ISelLowering.cpp | 9
-rw-r--r--  lib/Target/R600/R600Instructions.td | 1
-rw-r--r--  lib/Target/X86/X86FrameLowering.cpp | 35
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp | 41
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorize.cpp | 55
-rw-r--r--  test/CodeGen/ARM/vmul.ll | 24
-rw-r--r--  test/CodeGen/Generic/annotate.ll | 15
-rw-r--r--  test/CodeGen/Generic/ptr-annotate.ll | 18
-rw-r--r--  test/CodeGen/NVPTX/generic-to-nvvm.ll | 25
-rw-r--r--  test/CodeGen/NVPTX/i1-global.ll | 19
-rw-r--r--  test/CodeGen/NVPTX/i1-param.ll | 18
-rw-r--r--  test/CodeGen/NVPTX/intrinsics.ll | 7
-rw-r--r--  test/CodeGen/NVPTX/refl1.ll | 37
-rw-r--r--  test/CodeGen/R600/bfe_uint.ll | 26
-rw-r--r--  test/CodeGen/R600/mul.ll | 16
-rw-r--r--  test/CodeGen/R600/r600-encoding.ll | 24
-rw-r--r--  test/CodeGen/R600/sra.ll | 13
-rw-r--r--  test/CodeGen/R600/sub.ll | 15
-rw-r--r--  test/CodeGen/R600/vselect.ll | 17
-rw-r--r--  test/CodeGen/X86/compact-unwind.ll | 30
-rw-r--r--  test/CodeGen/X86/vec_compare.ll | 24
-rw-r--r--  test/DebugInfo/X86/stmt-list-multiple-compile-units.ll | 6
-rw-r--r--  test/ExecutionEngine/MCJIT/eh.ll | 2
-rw-r--r--  test/MC/AsmParser/exprs.s | 1
-rw-r--r--  test/Transforms/LoopUnroll/scevunroll.ll | 50
-rw-r--r--  test/Transforms/LoopUnroll/unloop.ll | 34
-rw-r--r--  test/Transforms/LoopVectorize/lcssa-crash.ll | 11
-rw-r--r--  test/Transforms/LoopVectorize/no_outside_user.ll | 41
-rw-r--r--  test/Transforms/LoopVectorize/value-ptr-bug.ll | 50
66 files changed, 1554 insertions, 1540 deletions
diff --git a/autoconf/configure.ac b/autoconf/configure.ac
index c1efd31a62..ffd155d2e2 100644
--- a/autoconf/configure.ac
+++ b/autoconf/configure.ac
@@ -31,7 +31,7 @@ dnl===
dnl===-----------------------------------------------------------------------===
dnl Initialize autoconf and define the package name, version number and
dnl address for reporting bugs.
-AC_INIT([LLVM],[3.3svn],[http://llvm.org/bugs/])
+AC_INIT([LLVM],[3.3],[http://llvm.org/bugs/])
AC_DEFINE([LLVM_VERSION_MAJOR], [3], [Major version of the LLVM API])
AC_DEFINE([LLVM_VERSION_MINOR], [3], [Minor version of the LLVM API])
diff --git a/configure b/configure
index 2452f820ab..3045c3260f 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.60 for LLVM 3.3svn.
+# Generated by GNU Autoconf 2.60 for LLVM 3.3.
#
# Report bugs to <http://llvm.org/bugs/>.
#
@@ -561,8 +561,8 @@ SHELL=${CONFIG_SHELL-/bin/sh}
# Identity of this package.
PACKAGE_NAME='LLVM'
PACKAGE_TARNAME='llvm'
-PACKAGE_VERSION='3.3svn'
-PACKAGE_STRING='LLVM 3.3svn'
+PACKAGE_VERSION='3.3'
+PACKAGE_STRING='LLVM 3.3'
PACKAGE_BUGREPORT='http://llvm.org/bugs/'
ac_unique_file="lib/IR/Module.cpp"
@@ -1331,7 +1331,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures LLVM 3.3svn to adapt to many kinds of systems.
+\`configure' configures LLVM 3.3 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1397,7 +1397,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of LLVM 3.3svn:";;
+ short | recursive ) echo "Configuration of LLVM 3.3:";;
esac
cat <<\_ACEOF
@@ -1569,7 +1569,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-LLVM configure 3.3svn
+LLVM configure 3.3
generated by GNU Autoconf 2.60
Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001,
@@ -1585,7 +1585,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by LLVM $as_me 3.3svn, which was
+It was created by LLVM $as_me 3.3, which was
generated by GNU Autoconf 2.60. Invocation command line was
$ $0 $@
@@ -23128,7 +23128,7 @@ exec 6>&1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by LLVM $as_me 3.3svn, which was
+This file was extended by LLVM $as_me 3.3, which was
generated by GNU Autoconf 2.60. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -23181,7 +23181,7 @@ Report bugs to <bug-autoconf@gnu.org>."
_ACEOF
cat >>$CONFIG_STATUS <<_ACEOF
ac_cs_version="\\
-LLVM config.status 3.3svn
+LLVM config.status 3.3
configured by $0, generated by GNU Autoconf 2.60,
with options \\"`echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\"
diff --git a/docs/CommandLine.rst b/docs/CommandLine.rst
index 263a025f69..9b77a98908 100644
--- a/docs/CommandLine.rst
+++ b/docs/CommandLine.rst
@@ -618,6 +618,8 @@ would yield the help output:
-help - display available options (-help-hidden for more)
-o <filename> - Specify output filename
+.. _grouping options into categories:
+
Grouping options into categories
--------------------------------
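The new cross-reference label added above points at the option-category feature introduced in this release. As a rough illustration only (not part of the patch), a tool linking the LLVM CommandLine library can group its own options like this, assuming the cl::OptionCategory and cl::cat API that docs/CommandLine.rst describes; the category name and option are invented for the example:

```cpp
#include "llvm/Support/CommandLine.h"
using namespace llvm;

// Hypothetical category and option, shown only to illustrate the feature the
// new ":ref:" label points at.
static cl::OptionCategory MyToolCategory("my-tool options",
                                         "Options specific to this example tool");

static cl::opt<std::string> OutputFilename("o", cl::desc("Specify output filename"),
                                           cl::value_desc("filename"),
                                           cl::cat(MyToolCategory));

int main(int argc, char **argv) {
  // -help now lists OutputFilename under the "my-tool options" heading.
  cl::ParseCommandLineOptions(argc, argv, "example tool\n");
  return 0;
}
```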
diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index 410f640776..7743ff06a0 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -2868,11 +2868,10 @@ All globals of this sort should have a section specified as
The '``llvm.used``' Global Variable
-----------------------------------
-The ``@llvm.used`` global is an array which has
- :ref:`appending linkage <linkage_appending>`. This array contains a list of
-pointers to global variables, functions and aliases which may optionally have a
-pointer cast formed of bitcast or getelementptr. For example, a legal
-use of it is:
+The ``@llvm.used`` global is an array which has :ref:`appending linkage
+<linkage_appending>`. This array contains a list of pointers to global
+variables, functions and aliases which may optionally have a pointer cast formed
+of bitcast or getelementptr. For example, a legal use of it is:
.. code-block:: llvm
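The IR example itself lies outside this hunk's context. For readers following along in C++, a minimal sketch of building the ``@llvm.used`` appending array programmatically, assuming the LLVM 3.3-era C++ API (the helper name markUsed is invented for illustration):

```cpp
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Invented helper: records GV in @llvm.used so optimizers treat it as used.
static void markUsed(Module &M, GlobalValue *GV) {
  Type *Int8Ptr = Type::getInt8PtrTy(M.getContext());
  Constant *Entry = ConstantExpr::getBitCast(GV, Int8Ptr); // the allowed pointer cast
  ArrayType *ATy = ArrayType::get(Int8Ptr, 1);
  GlobalVariable *Used = new GlobalVariable(
      M, ATy, /*isConstant=*/false, GlobalValue::AppendingLinkage,
      ConstantArray::get(ATy, Entry), "llvm.used");
  Used->setSection("llvm.metadata");
}
```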
diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst
index 7952cd5423..73b0abf628 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -5,12 +5,6 @@ LLVM 3.3 Release Notes
.. contents::
:local:
-.. warning::
- These are in-progress notes for the upcoming LLVM 3.3 release. You may
- prefer the `LLVM 3.2 Release Notes <http://llvm.org/releases/3.2/docs
- /ReleaseNotes.html>`_.
-
-
Introduction
============
@@ -34,13 +28,6 @@ page <http://llvm.org/releases/>`_.
Non-comprehensive list of changes in this release
=================================================
-.. NOTE
- For small 1-3 sentence descriptions, just add an entry at the end of
- this list. If your description won't fit comfortably in one bullet
- point (e.g. maybe you would like to give an example of the
- functionality, or simply have a lot to talk about), see the `NOTE` below
- for adding a new subsection.
-
* The CellSPU port has been removed. It can still be found in older versions.
* The IR-level extended linker APIs (for example, to link bitcode files out of
@@ -70,17 +57,15 @@ Non-comprehensive list of changes in this release
examples of the new syntax. The old syntax using register classes still
works, but it will be removed in a future LLVM release.
-* ... next change ...
-
-.. NOTE
- If you would like to document a larger change, then you can add a
- subsection about it right here. You can copy the following boilerplate
- and un-indent it (the indentation causes it to be inside this comment).
+* MCJIT now supports exception handling. Support for it in the old jit will be
+ removed in the 3.4 release.
- Special New Feature
- -------------------
+* Command line options can now be grouped into categories which are shown in
+ the output of ``-help``. See :ref:`grouping options into categories`.
- Makes programs 10x faster by doing Special New Thing.
+* The appearance of command line options in ``-help`` that are inherited by
+ linking with libraries that use the LLVM Command line support library can now
+ be modified at runtime. See :ref:`cl::getRegisteredOptions`.
AArch64 target
--------------
@@ -99,9 +84,59 @@ GNU-style thread local storage and inline assembly.
Hexagon Target
--------------
-- Removed support for legacy hexagonv2 and hexagonv3 processor
- architectures which are no longer in use. Currently supported
- architectures are hexagonv4 and hexagonv5.
+Removed support for legacy hexagonv2 and hexagonv3 processor architectures which
+are no longer in use. Currently supported architectures are hexagonv4 and
+hexagonv5.
+
+Mips target
+--------------
+
+New features and improvements:
+
+- Clang driver
+ - Support for Sourcery CodeBench Mips toolchain directories tree.
+ - Support for new command line options including:
+ - -mxgot/-mno-xgot
+ - -EL / -EB
+ - -mmicromips / -mno-micromips
+ - -msingle-float / -mdouble-float
+ - -mabi=32 (o32 abi) and -mabi=64 (n64 abi)
+ - Previously, options such as -mips16, -mmicromips, -mdsp and -mdspr2 were
+ not passed to the assembler. This issue has been fixed.
+
+- A number of changes have been made to improve the quality of DSP-ASE code
+ generation.
+ - Multiply and multiply-accumulate instructions can now use all four
+ accumulators.
+ - Instruction selection patterns have been added so that DSP instructions
+ are emitted without having to use builtins.
+
+- Delay slot filler pass can now search successor blocks for instructions to
+ fill delay slots (use option -disable-mips-df-succbb-search=false).
+
+PowerPC Target
+--------------
+
+New features and improvements:
+
+- PowerPC now supports an assembly parser.
+- Support added for thread-local storage. 64-bit ELF subtarget only.
+- Support added for medium and large code model (-mcmodel=medium,large).
+ Medium code model is now the default. 64-bit ELF subtarget only.
+- Improved register allocation (fewer reserved registers).
+- 64-bit atomic load and store are now supported.
+- Improved code generation for unaligned memory accesses of scalar types.
+- Improved performance of floating-point divide and square root
+ with -ffast-math.
+- Support for predicated returns.
+- Improved code generation for comparisons.
+- Support added for inline setjmp and longjmp.
+- Support added for many instructions introduced in PowerISA 2.04, 2.05,
+ and 2.06.
+- Improved spill code for vector registers.
+- Support added for -mno-altivec.
+- ABI compatibility fixes for complex parameters, 128-bit integer parameters,
+ and varargs functions. 64-bit ELF subtarget only.
Loop Vectorizer
---------------
@@ -126,16 +161,16 @@ SLP Vectorizer
--------------
LLVM now has a new SLP vectorizer. The new SLP vectorizer is not enabled by
-default but can be enabled using the clang flag -fslp-vectorize. The BB-vectorizer
-can also be enabled using the command line flag -fslp-vectorize-aggressive.
+default but can be enabled using the clang flag ``-fslp-vectorize``. The
+BB-vectorizer can also be enabled using the command line flag
+``-fslp-vectorize-aggressive``.
R600 Backend
------------
-The R600 backend was added in this release, it supports AMD GPUs
-(HD2XXX - HD7XXX). This backend is used in AMD's Open Source
-graphics / compute drivers which are developed as part of the `Mesa3D
-<http://www.mesa3d.org>`_ project.
+The R600 backend was added in this release, it supports AMD GPUs (HD2XXX -
+HD7XXX). This backend is used in AMD's Open Source graphics / compute drivers
+which are developed as part of the `Mesa3D <http://www.mesa3d.org>`_ project.
SystemZ/s390x Backend
---------------------
@@ -145,41 +180,130 @@ is restricted to GNU/Linux (GNU triplet s390x-linux-gnu) and requires
z10 or greater.
+Sub-project Status Update
+=========================
+
+In addition to the core LLVM 3.3 distribution of production-quality compiler
+infrastructure, the LLVM project includes sub-projects that use the LLVM core
+and share the same distribution license. This section provides updates on these
+sub-projects.
+
+
+DragonEgg: GCC front-ends, LLVM back-end
+----------------------------------------
+
+`DragonEgg <http://dragonegg.llvm.org/>`_ is a
+`GCC plugin <http://gcc.gnu.org/wiki/plugins>`_ that replaces GCC's optimizers
+and code generators with LLVM's. It works with gcc-4.5, 4.6, 4.7 and 4.8, can
+target the x86-32/x86-64 and ARM processor families, and has been successfully
+used on the Darwin, FreeBSD, KFreeBSD, Linux and OpenBSD platforms. It fully
+supports Ada, C, C++ and Fortran. It has partial support for Go, Java, Obj-C
+and Obj-C++. Note that gcc-4.6 is the best supported version, and that Ada in
+particular doesn't work well with gcc-4.7 and newer.
+
+The `3.3 release <http://llvm.org/apt/>`_ has the following notable changes.
+
+- supports gcc-4.8 (requires gcc-4.8.1 or newer)
+- object files can be written directly using LLVM's integrated assembler
+- produces saner debug info
+- bitfields can now contain arbitrary scalar types (useful for Ada)
+
+
+LLDB: Low Level Debugger
+------------------------
+
+`LLDB <http://lldb.llvm.org/>`_ is a ground-up implementation of a command-line
+debugger, as well as a debugger API that can be used from scripts and other
+applications. LLDB uses the following components of the LLVM core distribution
+to support the latest language features and target support:
+
+- the Clang parser for high-quality parsing of C, C++ and Objective C
+- the LLVM disassembler
+- the LLVM JIT compiler (MCJIT) for expression evaluation
+
+The `3.3 release <http://lldb.llvm.org/download.html>`_ has the following notable changes.
+
+Features now supported on Linux:
+
+- Debugging multi-threaded programs
+- Support for watchpoints
+- Process list, attach and fork
+- `vim integration <http://llvm.org/svn/llvm-project/lldb/branches/release_33/utils/vim-lldb/README>`_ for LLDB
+
+Portability:
+
+- Builds with cmake, ninja, auto-tools, clang 3.3 and gcc 4.6
+
+Linux Improvements:
+
+- Improved register support including vector registers
+- Basic debugging of i386 programs
+- Bug fixes for expression evaluation
+
+
External Open Source Projects Using LLVM 3.3
============================================
-An exciting aspect of LLVM is that it is used as an enabling technology for
-a lot of other language and tools projects. This section lists some of the
+An exciting aspect of LLVM is that it is used as an enabling technology for a
+lot of other language and tools projects. This section lists some of the
projects that have already been updated to work with LLVM 3.3.
Portable Computing Language (pocl)
----------------------------------
-In addition to producing an easily portable open source OpenCL
-implementation, another major goal of `pocl <http://pocl.sourceforge.net/>`_
-is improving performance portability of OpenCL programs with
-compiler optimizations, reducing the need for target-dependent manual
-optimizations. An important part of pocl is a set of LLVM passes used to
-statically parallelize multiple work-items with the kernel compiler, even in
-the presence of work-group barriers. This enables static parallelization of
-the fine-grained static concurrency in the work groups in multiple ways.
+In addition to producing an easily portable open source OpenCL implementation,
+another major goal of `pocl <http://pocl.sourceforge.net/>`_ is improving
+performance portability of OpenCL programs with compiler optimizations, reducing
+the need for target-dependent manual optimizations. An important part of pocl is
+a set of LLVM passes used to statically parallelize multiple work-items with the
+kernel compiler, even in the presence of work-group barriers. This enables
+static parallelization of the fine-grained static concurrency in the work groups
+in multiple ways.
TTA-based Co-design Environment (TCE)
-------------------------------------
-`TCE <http://tce.cs.tut.fi/>`_ is a toolset for designing new
-processors based on the Transport triggered architecture (TTA).
-The toolset provides a complete co-design flow from C/C++
-programs down to synthesizable VHDL/Verilog and parallel program binaries.
-Processor customization points include the register files, function units,
-supported operations, and the interconnection network.
+`TCE <http://tce.cs.tut.fi/>`_ is a toolset for designing new processors based
+on the Transport triggered architecture (TTA). The toolset provides a complete
+co-design flow from C/C++ programs down to synthesizable VHDL/Verilog and
+parallel program binaries. Processor customization points include the register
+files, function units, supported operations, and the interconnection network.
TCE uses Clang and LLVM for C/C++/OpenCL C language support, target independent
-optimizations and also for parts of code generation. It generates new
-LLVM-based code generators "on the fly" for the designed TTA processors and
-loads them in to the compiler backend as runtime libraries to avoid
-per-target recompilation of larger parts of the compiler chain.
+optimizations and also for parts of code generation. It generates new LLVM-based
+code generators "on the fly" for the designed TTA processors and loads them in
+to the compiler backend as runtime libraries to avoid per-target recompilation
+of larger parts of the compiler chain.
+
+Just-in-time Adaptive Decoder Engine (Jade)
+-------------------------------------------
+
+`Jade <https://github.com/orcc/jade>`_ (Just-in-time Adaptive Decoder Engine) is
+a generic video decoder engine using LLVM for just-in-time compilation of video
+decoder configurations. Those configurations are designed by MPEG Reconfigurable
+Video Coding (RVC) committee. MPEG RVC standard is built on a stream-based
+dataflow representation of decoders. It is composed of a standard library of
+coding tools written in RVC-CAL language and a dataflow configuration --- block
+diagram --- of a decoder.
+
+Jade project is hosted as part of the Open RVC-CAL Compiler (`Orcc
+<http://orcc.sf.net>`_) and requires it to translate the RVC-CAL standard
+library of video coding tools into an LLVM assembly code.
+
+LDC - the LLVM-based D compiler
+-------------------------------
+
+`D <http://dlang.org>`_ is a language with C-like syntax and static typing. It
+pragmatically combines efficiency, control, and modeling power, with safety and
+programmer productivity. D supports powerful concepts like Compile-Time Function
+Execution (CTFE) and Template Meta-Programming, provides an innovative approach
+to concurrency and offers many classical paradigms.
+
+`LDC <http://wiki.dlang.org/LDC>`_ uses the frontend from the reference compiler
+combined with LLVM as backend to produce efficient native code. LDC targets
+x86/x86_64 systems like Linux, OS X and Windows and also Linux/PPC64. Ports to
+other architectures like ARM are underway.
Additional Information
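One release-note item above mentions that options inherited from linked LLVM libraries can now be adjusted at runtime via cl::getRegisteredOptions. A minimal sketch of that use, assuming the LLVM 3.3 signature void cl::getRegisteredOptions(StringMap<Option*>&); the option being hidden is only an example name:

```cpp
#include "llvm/ADT/StringMap.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;

int main(int argc, char **argv) {
  // Collect every option registered by the LLVM libraries linked into this tool.
  StringMap<cl::Option *> Opts;
  cl::getRegisteredOptions(Opts);

  // Hide an inherited option from -help output (illustrative option name).
  if (cl::Option *O = Opts.lookup("print-after-all"))
    O->setHiddenFlag(cl::ReallyHidden);

  cl::ParseCommandLineOptions(argc, argv);
  return 0;
}
```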
diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h
index 306549fba4..349447fbbb 100644
--- a/include/llvm/Analysis/ScalarEvolution.h
+++ b/include/llvm/Analysis/ScalarEvolution.h
@@ -453,7 +453,8 @@ namespace llvm {
ExitLimit ComputeExitLimitFromCond(const Loop *L,
Value *ExitCond,
BasicBlock *TBB,
- BasicBlock *FBB);
+ BasicBlock *FBB,
+ bool IsSubExpr);
/// ComputeExitLimitFromICmp - Compute the number of times the backedge of
/// the specified loop will execute if its exit condition were a conditional
@@ -461,7 +462,8 @@ namespace llvm {
ExitLimit ComputeExitLimitFromICmp(const Loop *L,
ICmpInst *ExitCond,
BasicBlock *TBB,
- BasicBlock *FBB);
+ BasicBlock *FBB,
+ bool IsSubExpr);
/// ComputeLoadConstantCompareExitLimit - Given an exit condition
/// of 'icmp op load X, cst', try to see if we can compute the
@@ -483,7 +485,7 @@ namespace llvm {
/// HowFarToZero - Return the number of times an exit condition comparing
/// the specified value to zero will execute. If not computable, return
/// CouldNotCompute.
- ExitLimit HowFarToZero(const SCEV *V, const Loop *L);
+ ExitLimit HowFarToZero(const SCEV *V, const Loop *L, bool IsSubExpr);
/// HowFarToNonZero - Return the number of times an exit condition checking
/// the specified value for nonzero will execute. If not computable, return
@@ -495,7 +497,7 @@ namespace llvm {
/// computable, return CouldNotCompute. isSigned specifies whether the
/// less-than is signed.
ExitLimit HowManyLessThans(const SCEV *LHS, const SCEV *RHS,
- const Loop *L, bool isSigned);
+ const Loop *L, bool isSigned, bool IsSubExpr);
/// getPredecessorWithUniqueSuccessorForBB - Return a predecessor of BB
/// (which may not be an immediate predecessor) which has exactly one
diff --git a/include/llvm/IR/IntrinsicsNVVM.td b/include/llvm/IR/IntrinsicsNVVM.td
index ebfd03e484..c248517def 100644
--- a/include/llvm/IR/IntrinsicsNVVM.td
+++ b/include/llvm/IR/IntrinsicsNVVM.td
@@ -405,6 +405,8 @@ def llvm_anyi64ptr_ty : LLVMAnyPointerType<llvm_i64_ty>; // (space)i64*
// Sqrt
//
+ def int_nvvm_sqrt_f : GCCBuiltin<"__nvvm_sqrt_f">,
+ Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_sqrt_rn_ftz_f : GCCBuiltin<"__nvvm_sqrt_rn_ftz_f">,
Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
def int_nvvm_sqrt_rn_f : GCCBuiltin<"__nvvm_sqrt_rn_f">,
diff --git a/include/llvm/MC/MCELFObjectWriter.h b/include/llvm/MC/MCELFObjectWriter.h
index a59776d5cd..65dd1e8998 100644
--- a/include/llvm/MC/MCELFObjectWriter.h
+++ b/include/llvm/MC/MCELFObjectWriter.h
@@ -45,7 +45,14 @@ struct ELFRelocationEntry {
// Support lexicographic sorting.
bool operator<(const ELFRelocationEntry &RE) const {
- return RE.r_offset < r_offset;
+ if (RE.r_offset != r_offset)
+ return RE.r_offset < r_offset;
+ if (Type != RE.Type)
+ return Type < RE.Type;
+ if (Index != RE.Index)
+ return Index < RE.Index;
+ llvm_unreachable("ELFRelocs might be unstable!");
+ return 0;
}
};
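Context for the change above: the old comparator looked only at r_offset, so relocations sharing an offset had no defined relative order. A sketch (not from the patch) of the same ordering written with std::tie, making the key sequence explicit: descending r_offset, then ascending Type, then ascending Index.

```cpp
#include <cstdint>
#include <tuple>

struct Entry {
  uint64_t r_offset;
  unsigned Type;
  unsigned Index;
};

// Equivalent lexicographic ordering: highest r_offset first, then Type, then Index.
inline bool lessThan(const Entry &A, const Entry &B) {
  return std::tie(B.r_offset, A.Type, A.Index) <
         std::tie(A.r_offset, B.Type, B.Index);
}
```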
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 6ea915fdb0..f876748af3 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -3937,10 +3937,19 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
/// before taking the branch. For loops with multiple exits, it may not be the
/// number times that the loop header executes because the loop may exit
/// prematurely via another branch.
+///
+/// FIXME: We conservatively call getBackedgeTakenCount(L) instead of
+/// getExitCount(L, ExitingBlock) to compute a safe trip count considering all
+/// loop exits. getExitCount() may return an exact count for this branch
+/// assuming no-signed-wrap. The number of well-defined iterations may actually
+/// be higher than this trip count if this exit test is skipped and the loop
+/// exits via a different branch. Ideally, getExitCount() would know whether it
+/// depends on a NSW assumption, and we would only fall back to a conservative
+/// trip count in that case.
unsigned ScalarEvolution::
-getSmallConstantTripCount(Loop *L, BasicBlock *ExitingBlock) {
+getSmallConstantTripCount(Loop *L, BasicBlock */*ExitingBlock*/) {
const SCEVConstant *ExitCount =
- dyn_cast<SCEVConstant>(getExitCount(L, ExitingBlock));
+ dyn_cast<SCEVConstant>(getBackedgeTakenCount(L));
if (!ExitCount)
return 0;
@@ -3967,8 +3976,8 @@ getSmallConstantTripCount(Loop *L, BasicBlock *ExitingBlock) {
/// As explained in the comments for getSmallConstantTripCount, this assumes
/// that control exits the loop via ExitingBlock.
unsigned ScalarEvolution::
-getSmallConstantTripMultiple(Loop *L, BasicBlock *ExitingBlock) {
- const SCEV *ExitCount = getExitCount(L, ExitingBlock);
+getSmallConstantTripMultiple(Loop *L, BasicBlock */*ExitingBlock*/) {
+ const SCEV *ExitCount = getBackedgeTakenCount(L);
if (ExitCount == getCouldNotCompute())
return 1;
@@ -3997,7 +4006,7 @@ getSmallConstantTripMultiple(Loop *L, BasicBlock *ExitingBlock) {
}
// getExitCount - Get the expression for the number of loop iterations for which
-// this loop is guaranteed not to exit via ExitintBlock. Otherwise return
+// this loop is guaranteed not to exit via ExitingBlock. Otherwise return
// SCEVCouldNotCompute.
const SCEV *ScalarEvolution::getExitCount(Loop *L, BasicBlock *ExitingBlock) {
return getBackedgeTakenInfo(L).getExact(ExitingBlock, this);
@@ -4382,26 +4391,36 @@ ScalarEvolution::ComputeExitLimit(const Loop *L, BasicBlock *ExitingBlock) {
// Proceed to the next level to examine the exit condition expression.
return ComputeExitLimitFromCond(L, ExitBr->getCondition(),
ExitBr->getSuccessor(0),
- ExitBr->getSuccessor(1));
+ ExitBr->getSuccessor(1),
+ /*IsSubExpr=*/false);
}
/// ComputeExitLimitFromCond - Compute the number of times the
/// backedge of the specified loop will execute if its exit condition
/// were a conditional branch of ExitCond, TBB, and FBB.
+///
+/// @param IsSubExpr is true if ExitCond does not directly control the exit
+/// branch. In this case, we cannot assume that the loop only exits when the
+/// condition is true and cannot infer that failing to meet the condition prior
+/// to integer wraparound results in undefined behavior.
ScalarEvolution::ExitLimit
ScalarEvolution::ComputeExitLimitFromCond(const Loop *L,
Value *ExitCond,
BasicBlock *TBB,
- BasicBlock *FBB) {
+ BasicBlock *FBB,
+ bool IsSubExpr) {
// Check if the controlling expression for this loop is an And or Or.
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(ExitCond)) {
if (BO->getOpcode() == Instruction::And) {
// Recurse on the operands of the and.
- ExitLimit EL0 = ComputeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB);
- ExitLimit EL1 = ComputeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB);
+ bool EitherMayExit = L->contains(TBB);
+ ExitLimit EL0 = ComputeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB,
+ IsSubExpr || EitherMayExit);
+ ExitLimit EL1 = ComputeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB,
+ IsSubExpr || EitherMayExit);
const SCEV *BECount = getCouldNotCompute();
const SCEV *MaxBECount = getCouldNotCompute();
- if (L->contains(TBB)) {
+ if (EitherMayExit) {
// Both conditions must be true for the loop to continue executing.
// Choose the less conservative count.
if (EL0.Exact == getCouldNotCompute() ||
@@ -4429,11 +4448,14 @@ ScalarEvolution::ComputeExitLimitFromCond(const Loop *L,
}
if (BO->getOpcode() == Instruction::Or) {
// Recurse on the operands of the or.
- ExitLimit EL0 = ComputeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB);
- ExitLimit EL1 = ComputeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB);
+ bool EitherMayExit = L->contains(FBB);
+ ExitLimit EL0 = ComputeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB,
+ IsSubExpr || EitherMayExit);
+ ExitLimit EL1 = ComputeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB,
+ IsSubExpr || EitherMayExit);
const SCEV *BECount = getCouldNotCompute();
const SCEV *MaxBECount = getCouldNotCompute();
- if (L->contains(FBB)) {
+ if (EitherMayExit) {
// Both conditions must be false for the loop to continue executing.
// Choose the less conservative count.
if (EL0.Exact == getCouldNotCompute() ||
@@ -4464,7 +4486,7 @@ ScalarEvolution::ComputeExitLimitFromCond(const Loop *L,
// With an icmp, it may be feasible to compute an exact backedge-taken count.
// Proceed to the next level to examine the icmp.
if (ICmpInst *ExitCondICmp = dyn_cast<ICmpInst>(ExitCond))
- return ComputeExitLimitFromICmp(L, ExitCondICmp, TBB, FBB);
+ return ComputeExitLimitFromICmp(L, ExitCondICmp, TBB, FBB, IsSubExpr);
// Check for a constant condition. These are normally stripped out by
// SimplifyCFG, but ScalarEvolution may be used by a pass which wishes to
@@ -4490,7 +4512,8 @@ ScalarEvolution::ExitLimit
ScalarEvolution::ComputeExitLimitFromICmp(const Loop *L,
ICmpInst *ExitCond,
BasicBlock *TBB,
- BasicBlock *FBB) {
+ BasicBlock *FBB,
+ bool IsSubExpr) {
// If the condition was exit on true, convert the condition to exit on false
ICmpInst::Predicate Cond;
@@ -4542,7 +4565,7 @@ ScalarEvolution::ComputeExitLimitFromICmp(const Loop *L,
switch (Cond) {
case ICmpInst::ICMP_NE: { // while (X != Y)
// Convert to: while (X-Y != 0)
- ExitLimit EL = HowFarToZero(getMinusSCEV(LHS, RHS), L);
+ ExitLimit EL = HowFarToZero(getMinusSCEV(LHS, RHS), L, IsSubExpr);
if (EL.hasAnyInfo()) return EL;
break;
}
@@ -4553,24 +4576,24 @@ ScalarEvolution::ComputeExitLimitFromICmp(const Loop *L,
break;
}
case ICmpInst::ICMP_SLT: {
- ExitLimit EL = HowManyLessThans(LHS, RHS, L, true);
+ ExitLimit EL = HowManyLessThans(LHS, RHS, L, true, IsSubExpr);
if (EL.hasAnyInfo()) return EL;
break;
}
case ICmpInst::ICMP_SGT: {
ExitLimit EL = HowManyLessThans(getNotSCEV(LHS),
- getNotSCEV(RHS), L, true);
+ getNotSCEV(RHS), L, true, IsSubExpr);
if (EL.hasAnyInfo()) return EL;
break;
}
case ICmpInst::ICMP_ULT: {
- ExitLimit EL = HowManyLessThans(LHS, RHS, L, false);
+ ExitLimit EL = HowManyLessThans(LHS, RHS, L, false, IsSubExpr);
if (EL.hasAnyInfo()) return EL;
break;
}
case ICmpInst::ICMP_UGT: {
ExitLimit EL = HowManyLessThans(getNotSCEV(LHS),
- getNotSCEV(RHS), L, false);
+ getNotSCEV(RHS), L, false, IsSubExpr);
if (EL.hasAnyInfo()) return EL;
break;
}
@@ -5439,7 +5462,7 @@ SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) {
/// effectively V != 0. We know and take advantage of the fact that this
/// expression only being used in a comparison by zero context.
ScalarEvolution::ExitLimit
-ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L) {
+ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L, bool IsSubExpr) {
// If the value is a constant
if (const SCEVConstant *C = dyn_cast<SCEVConstant>(V)) {
// If the value is already zero, the branch will execute zero times.
@@ -5537,19 +5560,20 @@ ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L) {
}
// If the recurrence is known not to wraparound, unsigned divide computes the
- // back edge count. We know that the value will either become zero (and thus
- // the loop terminates), that the loop will terminate through some other exit
- // condition first, or that the loop has undefined behavior. This means
- // we can't "miss" the exit value, even with nonunit stride.
+ // back edge count. (Ideally we would have an "isexact" bit for udiv). We know
+ // that the value will either become zero (and thus the loop terminates), that
+ // the loop will terminate through some other exit condition first, or that
+ // the loop has undefined behavior. This means we can't "miss" the exit
+ // value, even with nonunit stride.
//
- // FIXME: Prove that loops always exhibits *acceptable* undefined
- // behavior. Loops must exhibit defined behavior until a wrapped value is
- // actually used. So the trip count computed by udiv could be smaller than the
- // number of well-defined iterations.
- if (AddRec->getNoWrapFlags(SCEV::FlagNW)) {
- // FIXME: We really want an "isexact" bit for udiv.
+ // This is only valid for expressions that directly compute the loop exit. It
+ // is invalid for subexpressions in which the loop may exit through this
+ // branch even if this subexpression is false. In that case, the trip count
+ // computed by this udiv could be smaller than the number of well-defined
+ // iterations.
+ if (!IsSubExpr && AddRec->getNoWrapFlags(SCEV::FlagNW))
return getUDivExpr(Distance, CountDown ? getNegativeSCEV(Step) : Step);
- }
+
// Then, try to solve the above equation provided that Start is constant.
if (const SCEVConstant *StartC = dyn_cast<SCEVConstant>(Start))
return SolveLinEquationWithOverflow(StepC->getValue()->getValue(),
@@ -6315,9 +6339,14 @@ const SCEV *ScalarEvolution::getBECount(const SCEV *Start,
/// HowManyLessThans - Return the number of times a backedge containing the
/// specified less-than comparison will execute. If not computable, return
/// CouldNotCompute.
+///
+/// @param IsSubExpr is true when the LHS < RHS condition does not directly
+/// control the branch. In this case, we can only compute an iteration count for
+/// a subexpression that cannot overflow before evaluating true.
ScalarEvolution::ExitLimit
ScalarEvolution::HowManyLessThans(const SCEV *LHS, const SCEV *RHS,
- const Loop *L, bool isSigned) {
+ const Loop *L, bool isSigned,
+ bool IsSubExpr) {
// Only handle: "ADDREC < LoopInvariant".
if (!isLoopInvariant(RHS, L)) return getCouldNotCompute();
@@ -6326,10 +6355,12 @@ ScalarEvolution::HowManyLessThans(const SCEV *LHS, const SCEV *RHS,
return getCouldNotCompute();
// Check to see if we have a flag which makes analysis easy.
- bool NoWrap = isSigned ?
- AddRec->getNoWrapFlags((SCEV::NoWrapFlags)(SCEV::FlagNSW | SCEV::FlagNW)) :
- AddRec->getNoWrapFlags((SCEV::NoWrapFlags)(SCEV::FlagNUW | SCEV::FlagNW));
-
+ bool NoWrap = false;
+ if (!IsSubExpr) {
+ NoWrap = AddRec->getNoWrapFlags(
+ (SCEV::NoWrapFlags)(((isSigned ? SCEV::FlagNSW : SCEV::FlagNUW))
+ | SCEV::FlagNW));
+ }
if (AddRec->isAffine()) {
unsigned BitWidth = getTypeSizeInBits(AddRec->getType());
const SCEV *Step = AddRec->getStepRecurrence(*this);
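A hypothetical source-level illustration (not part of the patch) of the situation the new IsSubExpr flag guards against: the "i != N" test below is only one operand of an 'and', so the loop may exit earlier through the other operand, and a count derived from "i != N" alone under a no-wrap assumption cannot be taken as the exact backedge-taken count.

```cpp
#include <cstdio>

static int firstZero(const int *A, int N) {
  int i = 0;
  // ComputeExitLimitFromCond recurses on both operands of the 'and' with
  // IsSubExpr set, so HowFarToZero/HowManyLessThans stay conservative here.
  while (i != N && A[i] != 0)
    ++i;
  return i;
}

int main() {
  const int Data[] = {3, 1, 4, 0, 5};
  std::printf("%d\n", firstZero(Data, 5)); // prints 3: exit via A[i] == 0
  return 0;
}
```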
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index aeaa63f2af..73bba6989f 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -708,6 +708,12 @@ CompileUnit *DwarfDebug::constructCompileUnit(const MDNode *N) {
Asm->OutStreamer.getContext().setMCLineTableSymbol(LineTableStartSym,
NewCU->getUniqueID());
+ // Use a single line table if we are using .loc and generating assembly.
+ bool UseTheFirstCU =
+ (Asm->TM.hasMCUseLoc() &&
+ Asm->OutStreamer.getKind() == MCStreamer::SK_AsmStreamer) ||
+ (NewCU->getUniqueID() == 0);
+
// DW_AT_stmt_list is a offset of line number information for this
// compile unit in debug_line section. For split dwarf this is
// left in the skeleton CU and so not included.
@@ -716,9 +722,9 @@ CompileUnit *DwarfDebug::constructCompileUnit(const MDNode *N) {
if (!useSplitDwarf()) {
if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
NewCU->addLabel(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4,
- NewCU->getUniqueID() == 0 ?
+ UseTheFirstCU ?
Asm->GetTempSymbol("section_line") : LineTableStartSym);
- else if (NewCU->getUniqueID() == 0)
+ else if (UseTheFirstCU)
NewCU->addUInt(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, 0);
else
NewCU->addDelta(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4,
@@ -1441,7 +1447,12 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
LexicalScope *FnScope = LScopes.getCurrentFunctionScope();
CompileUnit *TheCU = SPMap.lookup(FnScope->getScopeNode());
assert(TheCU && "Unable to find compile unit!");
- Asm->OutStreamer.getContext().setDwarfCompileUnitID(TheCU->getUniqueID());
+ if (Asm->TM.hasMCUseLoc() &&
+ Asm->OutStreamer.getKind() == MCStreamer::SK_AsmStreamer)
+ // Use a single line table if we are using .loc and generating assembly.
+ Asm->OutStreamer.getContext().setDwarfCompileUnitID(0);
+ else
+ Asm->OutStreamer.getContext().setDwarfCompileUnitID(TheCU->getUniqueID());
FunctionBeginSym = Asm->GetTempSymbol("func_begin",
Asm->getFunctionNumber());
diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp
index 07f0ccf52f..d894f664dc 100644
--- a/lib/CodeGen/IntrinsicLowering.cpp
+++ b/lib/CodeGen/IntrinsicLowering.cpp
@@ -453,6 +453,12 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) {
CI->replaceAllUsesWith(ConstantInt::get(CI->getType(), 1));
break;
+ case Intrinsic::annotation:
+ case Intrinsic::ptr_annotation:
+ // Just drop the annotation, but forward the value
+ CI->replaceAllUsesWith(CI->getOperand(0));
+ break;
+
case Intrinsic::var_annotation:
break; // Strip out annotate intrinsic
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 2ded723ca0..67db211ec4 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5034,6 +5034,11 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
setValue(&I, Res);
return 0;
}
+ case Intrinsic::annotation:
+ case Intrinsic::ptr_annotation:
+ // Drop the intrinsic, but forward the value
+ setValue(&I, getValue(I.getOperand(0)));
+ return 0;
case Intrinsic::var_annotation:
// Discard annotate attributes
return 0;
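For context, a Clang-specific illustration (assumed, not taken from the patch): __builtin_annotation lowers to a call to the llvm.annotation intrinsic, and with the two changes above both IntrinsicLowering and SelectionDAGBuilder simply forward the annotated value, so the function below should generate the same code as one returning x directly.

```cpp
// Requires Clang: __builtin_annotation produces a call to llvm.annotation
// carrying the value of x and the string "observed.value".
int tagged(int x) {
  return __builtin_annotation(x, "observed.value");
}
```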
diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index 2368e9e86a..edefdb4c36 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@@ -1092,7 +1092,7 @@ bool AsmParser::ParseBinOpRHS(unsigned Precedence, const MCExpr *&Res,
MCBinaryExpr::Opcode Dummy;
unsigned NextTokPrec = getBinOpPrecedence(Lexer.getKind(), Dummy);
if (TokPrec < NextTokPrec) {
- if (ParseBinOpRHS(Precedence+1, RHS, EndLoc)) return true;
+ if (ParseBinOpRHS(TokPrec+1, RHS, EndLoc)) return true;
}
// Merge LHS and RHS according to operator.
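The one-character change above is the classic precedence-climbing fix: when the operator after RHS binds tighter, the recursion must use the precedence of the operator just consumed (TokPrec + 1), not the caller's minimum precedence. A self-contained sketch of the scheme (single-digit operands, invented helper names) showing where the old recursion goes wrong:

```cpp
// Hypothetical sketch of precedence climbing over +, -, * on single digits.
#include <cstdio>

static const char *P; // cursor into the expression being parsed

static int prec(char op) {
  switch (op) {
  case '+': case '-': return 2;
  case '*':           return 3;
  default:            return 0; // not a binary operator: stop climbing
  }
}

static long parsePrimary() { return *P++ - '0'; }

static long parseBinOpRHS(int minPrec, long lhs) {
  for (;;) {
    int tokPrec = prec(*P);
    if (tokPrec < minPrec)
      return lhs;                  // operator binds too loosely for this level
    char op = *P++;
    long rhs = parsePrimary();
    // If the operator after rhs binds tighter than 'op', it takes rhs first.
    if (tokPrec < prec(*P))
      rhs = parseBinOpRHS(tokPrec + 1, rhs); // the fix; minPrec + 1 is wrong here
    switch (op) {
    case '+': lhs += rhs; break;
    case '-': lhs -= rhs; break;
    case '*': lhs *= rhs; break;
    }
  }
}

int main() {
  P = "8-6*2-3";
  // Left-associative result: (8 - 6*2) - 3 == -7. Recursing with minPrec + 1
  // instead would swallow the trailing "- 3" inside the recursion and compute
  // 8 - (6*2 - 3) == -1.
  std::printf("%ld\n", parseBinOpRHS(0, parsePrimary()));
  return 0;
}
```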
diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp
index 4839c3470c..8f1895e048 100644
--- a/lib/MC/MCStreamer.cpp
+++ b/lib/MC/MCStreamer.cpp
@@ -35,6 +35,7 @@ MCStreamer::~MCStreamer() {
void MCStreamer::reset() {
for (unsigned i = 0; i < getNumW64UnwindInfos(); ++i)
delete W64UnwindInfos[i];
+ W64UnwindInfos.clear();
EmitEHFrame = true;
EmitDebugFrame = false;
CurrentW64UnwindInfo = 0;
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 0f7beb1e3b..e49cfc4985 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -5257,6 +5257,23 @@ static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
return false;
}
+static EVT getExtensionTo64Bits(const EVT &OrigVT) {
+ if (OrigVT.getSizeInBits() >= 64)
+ return OrigVT;
+
+ assert(OrigVT.isSimple() && "Expecting a simple value type");
+
+ MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
+ switch (OrigSimpleTy) {
+ default: llvm_unreachable("Unexpected Vector Type");
+ case MVT::v2i8:
+ case MVT::v2i16:
+ return MVT::v2i32;
+ case MVT::v4i8:
+ return MVT::v4i16;
+ }
+}
+
/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
/// We insert the required extension here to get the vector to fill a D register.
@@ -5272,18 +5289,8 @@ static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
return N;
// Must extend size to at least 64 bits to be used as an operand for VMULL.
- MVT::SimpleValueType OrigSimpleTy = OrigTy.getSimpleVT().SimpleTy;
- EVT NewVT;
- switch (OrigSimpleTy) {
- default: llvm_unreachable("Unexpected Orig Vector Type");
- case MVT::v2i8:
- case MVT::v2i16:
- NewVT = MVT::v2i32;
- break;
- case MVT::v4i8:
- NewVT = MVT::v4i16;
- break;
- }
+ EVT NewVT = getExtensionTo64Bits(OrigTy);
+
return DAG.getNode(ExtOpcode, N->getDebugLoc(), NewVT, N);
}
@@ -5293,22 +5300,22 @@ static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG,
/// reach a total size of 64 bits. We have to add the extension separately
/// because ARM does not have a sign/zero extending load for vectors.
static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) {
- SDValue NonExtendingLoad =
- DAG.getLoad(LD->getMemoryVT(), LD->getDebugLoc(), LD->getChain(),
+ EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
+
+ // The load already has the right type.
+ if (ExtendedTy == LD->getMemoryVT())
+ return DAG.getLoad(LD->getMemoryVT(), LD->getDebugLoc(), LD->getChain(),
LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(),
LD->isNonTemporal(), LD->isInvariant(),
LD->getAlignment());
- unsigned ExtOp = 0;
- switch (LD->getExtensionType()) {
- default: llvm_unreachable("Unexpected LoadExtType");
- case ISD::EXTLOAD:
- case ISD::SEXTLOAD: ExtOp = ISD::SIGN_EXTEND; break;
- case ISD::ZEXTLOAD: ExtOp = ISD::ZERO_EXTEND; break;
- }
- MVT::SimpleValueType MemType = LD->getMemoryVT().getSimpleVT().SimpleTy;
- MVT::SimpleValueType ExtType = LD->getValueType(0).getSimpleVT().SimpleTy;
- return AddRequiredExtensionForVMULL(NonExtendingLoad, DAG,
- MemType, ExtType, ExtOp);
+
+ // We need to create a zextload/sextload. We cannot just create a load
+ // followed by a zext/zext node because LowerMUL is also run during normal
+ // operation legalization where we can't create illegal types.
+ return DAG.getExtLoad(LD->getExtensionType(), LD->getDebugLoc(), ExtendedTy,
+ LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
+ LD->getMemoryVT(), LD->isVolatile(),
+ LD->isNonTemporal(), LD->getAlignment());
}
/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
diff --git a/lib/Target/NVPTX/CMakeLists.txt b/lib/Target/NVPTX/CMakeLists.txt
index 7da2fed4cd..735ca9b6b7 100644
--- a/lib/Target/NVPTX/CMakeLists.txt
+++ b/lib/Target/NVPTX/CMakeLists.txt
@@ -23,6 +23,7 @@ set(NVPTXCodeGen_sources
NVPTXAsmPrinter.cpp
NVPTXUtilities.cpp
NVVMReflect.cpp
+ NVPTXGenericToNVVM.cpp
)
add_llvm_target(NVPTXCodeGen ${NVPTXCodeGen_sources})
diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h
index 6a53a443bf..072c65da35 100644
--- a/lib/Target/NVPTX/NVPTX.h
+++ b/lib/Target/NVPTX/NVPTX.h
@@ -16,6 +16,7 @@
#define LLVM_TARGET_NVPTX_H
#include "MCTargetDesc/NVPTXBaseInfo.h"
+#include "llvm/ADT/StringMap.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/ErrorHandling.h"
@@ -62,6 +63,9 @@ createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOpt::Level OptLevel);
FunctionPass *createLowerStructArgsPass(NVPTXTargetMachine &);
FunctionPass *createNVPTXReMatPass(NVPTXTargetMachine &);
FunctionPass *createNVPTXReMatBlockPass(NVPTXTargetMachine &);
+ModulePass *createGenericToNVVMPass();
+ModulePass *createNVVMReflectPass();
+ModulePass *createNVVMReflectPass(const StringMap<int>& Mapping);
bool isImageOrSamplerVal(const Value *, const Module *);
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index ce5d78afa3..229e4e5980 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -68,11 +68,12 @@ InterleaveSrc("nvptx-emit-src", cl::ZeroOrMore,
namespace {
/// DiscoverDependentGlobals - Return a set of GlobalVariables on which \p V
/// depends.
-void DiscoverDependentGlobals(Value *V, DenseSet<GlobalVariable *> &Globals) {
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+void DiscoverDependentGlobals(const Value *V,
+ DenseSet<const GlobalVariable *> &Globals) {
+ if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
Globals.insert(GV);
else {
- if (User *U = dyn_cast<User>(V)) {
+ if (const User *U = dyn_cast<User>(V)) {
for (unsigned i = 0, e = U->getNumOperands(); i != e; ++i) {
DiscoverDependentGlobals(U->getOperand(i), Globals);
}
@@ -84,8 +85,9 @@ void DiscoverDependentGlobals(Value *V, DenseSet<GlobalVariable *> &Globals) {
/// instances to be emitted, but only after any dependents have been added
/// first.
void VisitGlobalVariableForEmission(
- GlobalVariable *GV, SmallVectorImpl<GlobalVariable *> &Order,
- DenseSet<GlobalVariable *> &Visited, DenseSet<GlobalVariable *> &Visiting) {
+ const GlobalVariable *GV, SmallVectorImpl<const GlobalVariable *> &Order,
+ DenseSet<const GlobalVariable *> &Visited,
+ DenseSet<const GlobalVariable *> &Visiting) {
// Have we already visited this one?
if (Visited.count(GV))
return;
@@ -98,12 +100,12 @@ void VisitGlobalVariableForEmission(
Visiting.insert(GV);
// Make sure we visit all dependents first
- DenseSet<GlobalVariable *> Others;
+ DenseSet<const GlobalVariable *> Others;
for (unsigned i = 0, e = GV->getNumOperands(); i != e; ++i)
DiscoverDependentGlobals(GV->getOperand(i), Others);
- for (DenseSet<GlobalVariable *>::iterator I = Others.begin(),
- E = Others.end();
+ for (DenseSet<const GlobalVariable *>::iterator I = Others.begin(),
+ E = Others.end();
I != E; ++I)
VisitGlobalVariableForEmission(*I, Order, Visited, Visiting);
@@ -405,6 +407,11 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() {
SmallString<128> Str;
raw_svector_ostream O(Str);
+ if (!GlobalsEmitted) {
+ emitGlobals(*MF->getFunction()->getParent());
+ GlobalsEmitted = true;
+ }
+
// Set up
MRI = &MF->getRegInfo();
F = MF->getFunction();
@@ -695,7 +702,7 @@ void NVPTXAsmPrinter::emitDeclaration(const Function *F, raw_ostream &O) {
else
O << ".func ";
printReturnValStr(F, O);
- O << *CurrentFnSym << "\n";
+ O << *Mang->getSymbol(F) << "\n";
emitFunctionParamList(F, O);
O << ";\n";
}
@@ -795,7 +802,7 @@ static bool useFuncSeen(const Constant *C,
return false;
}
-void NVPTXAsmPrinter::emitDeclarations(Module &M, raw_ostream &O) {
+void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) {
llvm::DenseMap<const Function *, bool> seenMap;
for (Module::const_iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) {
const Function *F = FI;
@@ -805,7 +812,6 @@ void NVPTXAsmPrinter::emitDeclarations(Module &M, raw_ostream &O) {
continue;
if (F->getIntrinsicID())
continue;
- CurrentFnSym = Mang->getSymbol(F);
emitDeclaration(F, O);
continue;
}
@@ -817,14 +823,12 @@ void NVPTXAsmPrinter::emitDeclarations(Module &M, raw_ostream &O) {
// The use is in the initialization of a global variable
// that is a function pointer, so print a declaration
// for the original function
- CurrentFnSym = Mang->getSymbol(F);
emitDeclaration(F, O);
break;
}
// Emit a declaration of this function if the function that
// uses this constant expr has already been seen.
if (useFuncSeen(C, seenMap)) {
- CurrentFnSym = Mang->getSymbol(F);
emitDeclaration(F, O);
break;
}
@@ -844,7 +848,6 @@ void NVPTXAsmPrinter::emitDeclarations(Module &M, raw_ostream &O) {
// appearing in the module before the callee. so print out
// a declaration for the callee.
if (seenMap.find(caller) != seenMap.end()) {
- CurrentFnSym = Mang->getSymbol(F);
emitDeclaration(F, O);
break;
}
@@ -921,6 +924,12 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) {
if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA)
recordAndEmitFilenames(M);
+ GlobalsEmitted = false;
+
+ return false; // success
+}
+
+void NVPTXAsmPrinter::emitGlobals(const Module &M) {
SmallString<128> Str2;
raw_svector_ostream OS2(Str2);
@@ -931,13 +940,13 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) {
// global variable in order, and ensure that we emit it *after* its dependent
// globals. We use a little extra memory maintaining both a set and a list to
// have fast searches while maintaining a strict ordering.
- SmallVector<GlobalVariable *, 8> Globals;
- DenseSet<GlobalVariable *> GVVisited;
- DenseSet<GlobalVariable *> GVVisiting;
+ SmallVector<const GlobalVariable *, 8> Globals;
+ DenseSet<const GlobalVariable *> GVVisited;
+ DenseSet<const GlobalVariable *> GVVisiting;
// Visit each global variable, in order
- for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E;
- ++I)
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I)
VisitGlobalVariableForEmission(I, Globals, GVVisited, GVVisiting);
assert(GVVisited.size() == M.getGlobalList().size() &&
@@ -951,7 +960,6 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) {
OS2 << '\n';
OutStreamer.EmitRawText(OS2.str());
- return false; // success
}
void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O) {
@@ -989,6 +997,14 @@ void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O) {
}
bool NVPTXAsmPrinter::doFinalization(Module &M) {
+
+ // If we did not emit any functions, then the global declarations have not
+ // yet been emitted.
+ if (!GlobalsEmitted) {
+ emitGlobals(M);
+ GlobalsEmitted = true;
+ }
+
// XXX Temproarily remove global variables so that doFinalization() will not
// emit them again (global variables are emitted at beginning).
@@ -1063,7 +1079,8 @@ void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V,
}
}
-void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O,
+void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
+ raw_ostream &O,
bool processDemoted) {
// Skip meta data
@@ -1107,10 +1124,10 @@ void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O,
if (llvm::isSampler(*GVar)) {
O << ".global .samplerref " << llvm::getSamplerName(*GVar);
- Constant *Initializer = NULL;
+ const Constant *Initializer = NULL;
if (GVar->hasInitializer())
Initializer = GVar->getInitializer();
- ConstantInt *CI = NULL;
+ const ConstantInt *CI = NULL;
if (Initializer)
CI = dyn_cast<ConstantInt>(Initializer);
if (CI) {
@@ -1183,7 +1200,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O,
if (localDecls.find(demotedFunc) != localDecls.end())
localDecls[demotedFunc].push_back(GVar);
else {
- std::vector<GlobalVariable *> temp;
+ std::vector<const GlobalVariable *> temp;
temp.push_back(GVar);
localDecls[demotedFunc] = temp;
}
@@ -1199,7 +1216,11 @@ void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O,
if (ETy->isPrimitiveType() || ETy->isIntegerTy() || isa<PointerType>(ETy)) {
O << " .";
- O << getPTXFundamentalTypeStr(ETy, false);
+ // Special case: ABI requires that we use .u8 for predicates
+ if (ETy->isIntegerTy(1))
+ O << "u8";
+ else
+ O << getPTXFundamentalTypeStr(ETy, false);
O << " ";
O << *Mang->getSymbol(GVar);
@@ -1209,7 +1230,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O,
(PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST_NOT_GEN) ||
(PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) &&
GVar->hasInitializer()) {
- Constant *Initializer = GVar->getInitializer();
+ const Constant *Initializer = GVar->getInitializer();
if (!Initializer->isNullValue()) {
O << " = ";
printScalarConstant(Initializer, O);
@@ -1233,7 +1254,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O,
(PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST_NOT_GEN) ||
(PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) &&
GVar->hasInitializer()) {
- Constant *Initializer = GVar->getInitializer();
+ const Constant *Initializer = GVar->getInitializer();
if (!isa<UndefValue>(Initializer) && !Initializer->isNullValue()) {
AggBuffer aggBuffer(ElementSize, O, *this);
bufferAggregateConstant(Initializer, &aggBuffer);
@@ -1283,7 +1304,7 @@ void NVPTXAsmPrinter::emitDemotedVars(const Function *f, raw_ostream &O) {
if (localDecls.find(f) == localDecls.end())
return;
- std::vector<GlobalVariable *> &gvars = localDecls[f];
+ std::vector<const GlobalVariable *> &gvars = localDecls[f];
for (unsigned i = 0, e = gvars.size(); i != e; ++i) {
O << "\t// demoted variable\n\t";
@@ -1448,7 +1469,7 @@ void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I,
int paramIndex, raw_ostream &O) {
if ((nvptxSubtarget.getDrvInterface() == NVPTX::NVCL) ||
(nvptxSubtarget.getDrvInterface() == NVPTX::CUDA))
- O << *CurrentFnSym << "_param_" << paramIndex;
+ O << *Mang->getSymbol(I->getParent()) << "_param_" << paramIndex;
else {
std::string argName = I->getName();
const char *p = argName.c_str();
@@ -1507,11 +1528,13 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
if (llvm::isImage(*I)) {
std::string sname = I->getName();
if (llvm::isImageWriteOnly(*I))
- O << "\t.param .surfref " << *CurrentFnSym << "_param_" << paramIndex;
+ O << "\t.param .surfref " << *Mang->getSymbol(F) << "_param_"
+ << paramIndex;
else // Default image is read_only
- O << "\t.param .texref " << *CurrentFnSym << "_param_" << paramIndex;
+ O << "\t.param .texref " << *Mang->getSymbol(F) << "_param_"
+ << paramIndex;
} else // Should be llvm::isSampler(*I)
- O << "\t.param .samplerref " << *CurrentFnSym << "_param_"
+ O << "\t.param .samplerref " << *Mang->getSymbol(F) << "_param_"
<< paramIndex;
continue;
}
@@ -1564,7 +1587,13 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
}
// non-pointer scalar to kernel func
- O << "\t.param ." << getPTXFundamentalTypeStr(Ty) << " ";
+ O << "\t.param .";
+ // Special case: predicate operands become .u8 types
+ if (Ty->isIntegerTy(1))
+ O << "u8";
+ else
+ O << getPTXFundamentalTypeStr(Ty);
+ O << " ";
printParamName(I, paramIndex, O);
continue;
}
@@ -1751,12 +1780,12 @@ void NVPTXAsmPrinter::printFPConstant(const ConstantFP *Fp, raw_ostream &O) {
O << utohexstr(API.getZExtValue());
}
-void NVPTXAsmPrinter::printScalarConstant(Constant *CPV, raw_ostream &O) {
- if (ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) {
+void NVPTXAsmPrinter::printScalarConstant(const Constant *CPV, raw_ostream &O) {
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) {
O << CI->getValue();
return;
}
- if (ConstantFP *CFP = dyn_cast<ConstantFP>(CPV)) {
+ if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CPV)) {
printFPConstant(CFP, O);
return;
}
@@ -1764,13 +1793,13 @@ void NVPTXAsmPrinter::printScalarConstant(Constant *CPV, raw_ostream &O) {
O << "0";
return;
}
- if (GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
+ if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
O << *Mang->getSymbol(GVar);
return;
}
- if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
- Value *v = Cexpr->stripPointerCasts();
- if (GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
+ if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+ const Value *v = Cexpr->stripPointerCasts();
+ if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
O << *Mang->getSymbol(GVar);
return;
} else {
@@ -1781,7 +1810,7 @@ void NVPTXAsmPrinter::printScalarConstant(Constant *CPV, raw_ostream &O) {
llvm_unreachable("Not scalar type found in printScalarConstant()");
}
-void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes,
+void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
AggBuffer *aggBuffer) {
const DataLayout *TD = TM.getDataLayout();
@@ -1809,13 +1838,13 @@ void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes,
ptr = (unsigned char *)&int16;
aggBuffer->addBytes(ptr, 2, Bytes);
} else if (ETy == Type::getInt32Ty(CPV->getContext())) {
- if (ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
+ if (const ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
int int32 = (int)(constInt->getZExtValue());
ptr = (unsigned char *)&int32;
aggBuffer->addBytes(ptr, 4, Bytes);
break;
- } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
- if (ConstantInt *constInt = dyn_cast<ConstantInt>(
+ } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+ if (const ConstantInt *constInt = dyn_cast<ConstantInt>(
ConstantFoldConstantExpression(Cexpr, TD))) {
int int32 = (int)(constInt->getZExtValue());
ptr = (unsigned char *)&int32;
@@ -1831,13 +1860,13 @@ void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes,
}
llvm_unreachable("unsupported integer const type");
} else if (ETy == Type::getInt64Ty(CPV->getContext())) {
- if (ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
+ if (const ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) {
long long int64 = (long long)(constInt->getZExtValue());
ptr = (unsigned char *)&int64;
aggBuffer->addBytes(ptr, 8, Bytes);
break;
- } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
- if (ConstantInt *constInt = dyn_cast<ConstantInt>(
+ } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+ if (const ConstantInt *constInt = dyn_cast<ConstantInt>(
ConstantFoldConstantExpression(Cexpr, TD))) {
long long int64 = (long long)(constInt->getZExtValue());
ptr = (unsigned char *)&int64;
@@ -1858,7 +1887,7 @@ void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes,
}
case Type::FloatTyID:
case Type::DoubleTyID: {
- ConstantFP *CFP = dyn_cast<ConstantFP>(CPV);
+ const ConstantFP *CFP = dyn_cast<ConstantFP>(CPV);
const Type *Ty = CFP->getType();
if (Ty == Type::getFloatTy(CPV->getContext())) {
float float32 = (float) CFP->getValueAPF().convertToFloat();
@@ -1874,10 +1903,10 @@ void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes,
break;
}
case Type::PointerTyID: {
- if (GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
+ if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
aggBuffer->addSymbol(GVar);
- } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
- Value *v = Cexpr->stripPointerCasts();
+ } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
+ const Value *v = Cexpr->stripPointerCasts();
aggBuffer->addSymbol(v);
}
unsigned int s = TD->getTypeAllocSize(CPV->getType());
@@ -1906,7 +1935,7 @@ void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes,
}
}
-void NVPTXAsmPrinter::bufferAggregateConstant(Constant *CPV,
+void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV,
AggBuffer *aggBuffer) {
const DataLayout *TD = TM.getDataLayout();
int Bytes;
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h
index 6dc9fc0ffe..7faa6b265b 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -91,7 +91,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
unsigned char *buffer; // the buffer
unsigned numSymbols; // number of symbol addresses
SmallVector<unsigned, 4> symbolPosInBuffer;
- SmallVector<Value *, 4> Symbols;
+ SmallVector<const Value *, 4> Symbols;
private:
unsigned curpos;
@@ -128,7 +128,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
}
return curpos;
}
- void addSymbol(Value *GVar) {
+ void addSymbol(const Value *GVar) {
symbolPosInBuffer.push_back(curpos);
Symbols.push_back(GVar);
numSymbols++;
@@ -153,11 +153,11 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
if (pos)
O << ", ";
if (pos == nextSymbolPos) {
- Value *v = Symbols[nSym];
- if (GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
+ const Value *v = Symbols[nSym];
+ if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
MCSymbol *Name = AP.Mang->getSymbol(GVar);
O << *Name;
- } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(v)) {
+ } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(v)) {
O << *nvptx::LowerConstant(Cexpr, AP);
} else
llvm_unreachable("symbol type unknown");
@@ -205,10 +205,12 @@ private:
void printImplicitDef(const MachineInstr *MI, raw_ostream &O) const;
// definition autogenerated.
void printInstruction(const MachineInstr *MI, raw_ostream &O);
- void printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O, bool = false);
+ void printModuleLevelGV(const GlobalVariable *GVar, raw_ostream &O,
+ bool = false);
void printParamName(int paramIndex, raw_ostream &O);
void printParamName(Function::const_arg_iterator I, int paramIndex,
raw_ostream &O);
+ void emitGlobals(const Module &M);
void emitHeader(Module &M, raw_ostream &O);
void emitKernelFunctionDirectives(const Function &F, raw_ostream &O) const;
void emitVirtualRegister(unsigned int vr, bool isVec, raw_ostream &O);
@@ -234,6 +236,8 @@ protected:
private:
std::string CurrentBankselLabelInBasicBlock;
+ bool GlobalsEmitted;
+
// This is specific per MachineFunction.
const MachineRegisterInfo *MRI;
// The contents are specific for each
@@ -247,7 +251,7 @@ private:
std::map<const Type *, std::string> TypeNameMap;
// List of variables demoted to a function scope.
- std::map<const Function *, std::vector<GlobalVariable *> > localDecls;
+ std::map<const Function *, std::vector<const GlobalVariable *> > localDecls;
// To record filename to ID mapping
std::map<std::string, unsigned> filenameMap;
@@ -256,15 +260,15 @@ private:
void emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O);
void emitPTXAddressSpace(unsigned int AddressSpace, raw_ostream &O) const;
std::string getPTXFundamentalTypeStr(const Type *Ty, bool = true) const;
- void printScalarConstant(Constant *CPV, raw_ostream &O);
+ void printScalarConstant(const Constant *CPV, raw_ostream &O);
void printFPConstant(const ConstantFP *Fp, raw_ostream &O);
- void bufferLEByte(Constant *CPV, int Bytes, AggBuffer *aggBuffer);
- void bufferAggregateConstant(Constant *CV, AggBuffer *aggBuffer);
+ void bufferLEByte(const Constant *CPV, int Bytes, AggBuffer *aggBuffer);
+ void bufferAggregateConstant(const Constant *CV, AggBuffer *aggBuffer);
void printOperandProper(const MachineOperand &MO);
void emitLinkageDirective(const GlobalValue *V, raw_ostream &O);
- void emitDeclarations(Module &, raw_ostream &O);
+ void emitDeclarations(const Module &, raw_ostream &O);
void emitDeclaration(const Function *, raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
diff --git a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
new file mode 100644
index 0000000000..1077c46fb4
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
@@ -0,0 +1,436 @@
+//===-- GenericToNVVM.cpp - Convert generic module to NVVM module - C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Convert generic global variables into either .global or .const access based
+// on the variable's "constant" qualifier.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXUtilities.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
+
+#include "llvm/PassManager.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/ADT/ValueMap.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/IRBuilder.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeGenericToNVVMPass(PassRegistry &);
+}
+
+namespace {
+class GenericToNVVM : public ModulePass {
+public:
+ static char ID;
+
+ GenericToNVVM() : ModulePass(ID) {}
+
+ virtual bool runOnModule(Module &M);
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ }
+
+private:
+ Value *getOrInsertCVTA(Module *M, Function *F, GlobalVariable *GV,
+ IRBuilder<> &Builder);
+ Value *remapConstant(Module *M, Function *F, Constant *C,
+ IRBuilder<> &Builder);
+ Value *remapConstantVectorOrConstantAggregate(Module *M, Function *F,
+ Constant *C,
+ IRBuilder<> &Builder);
+ Value *remapConstantExpr(Module *M, Function *F, ConstantExpr *C,
+ IRBuilder<> &Builder);
+ void remapNamedMDNode(Module *M, NamedMDNode *N);
+ MDNode *remapMDNode(Module *M, MDNode *N);
+
+ typedef ValueMap<GlobalVariable *, GlobalVariable *> GVMapTy;
+ typedef ValueMap<Constant *, Value *> ConstantToValueMapTy;
+ GVMapTy GVMap;
+ ConstantToValueMapTy ConstantToValueMap;
+};
+}
+
+char GenericToNVVM::ID = 0;
+
+ModulePass *llvm::createGenericToNVVMPass() { return new GenericToNVVM(); }
+
+INITIALIZE_PASS(
+ GenericToNVVM, "generic-to-nvvm",
+ "Ensure that the global variables are in the global address space", false,
+ false)
+
+bool GenericToNVVM::runOnModule(Module &M) {
+ // Create a clone of each global variable that has the default address space.
+ // The clone is created with the global address space specifier, and the pair
+ // of original global variable and its clone is placed in the GVMap for later
+ // use.
+
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E;) {
+ GlobalVariable *GV = I++;
+ if (GV->getType()->getAddressSpace() == llvm::ADDRESS_SPACE_GENERIC &&
+ !llvm::isTexture(*GV) && !llvm::isSurface(*GV) &&
+ !GV->getName().startswith("llvm.")) {
+ GlobalVariable *NewGV = new GlobalVariable(
+ M, GV->getType()->getElementType(), GV->isConstant(),
+ GV->getLinkage(), GV->hasInitializer() ? GV->getInitializer() : NULL,
+ "", GV, GV->getThreadLocalMode(), llvm::ADDRESS_SPACE_GLOBAL);
+ NewGV->copyAttributesFrom(GV);
+ GVMap[GV] = NewGV;
+ }
+ }
+
+ // Return immediately if every global variable already has a specific address space
+ // specifier.
+ if (GVMap.empty()) {
+ return false;
+ }
+
+ // Walk through the instructions in function definitions, and replace any use
+ // of original global variables in GVMap with a use of the corresponding
+ // copies in GVMap. If necessary, promote constants to instructions.
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+ if (I->isDeclaration()) {
+ continue;
+ }
+ IRBuilder<> Builder(I->getEntryBlock().getFirstNonPHIOrDbg());
+ for (Function::iterator BBI = I->begin(), BBE = I->end(); BBI != BBE;
+ ++BBI) {
+ for (BasicBlock::iterator II = BBI->begin(), IE = BBI->end(); II != IE;
+ ++II) {
+ for (unsigned i = 0, e = II->getNumOperands(); i < e; ++i) {
+ Value *Operand = II->getOperand(i);
+ if (isa<Constant>(Operand)) {
+ II->setOperand(
+ i, remapConstant(&M, I, cast<Constant>(Operand), Builder));
+ }
+ }
+ }
+ }
+ ConstantToValueMap.clear();
+ }
+
+ // Walk through the metadata section and update the debug information
+ // associated with the global variables in the default address space.
+ for (Module::named_metadata_iterator I = M.named_metadata_begin(),
+ E = M.named_metadata_end();
+ I != E; I++) {
+ remapNamedMDNode(&M, I);
+ }
+
+ // Walk through the global variable initializers, and replace any use of
+ // original global variables in GVMap with a use of the corresponding copies
+ // in GVMap. The copies need to be bitcast to the original global variable
+ // types, as we cannot use cvta in global variable initializers.
+ for (GVMapTy::iterator I = GVMap.begin(), E = GVMap.end(); I != E;) {
+ GlobalVariable *GV = I->first;
+ GlobalVariable *NewGV = I->second;
+ ++I;
+ Constant *BitCastNewGV = ConstantExpr::getBitCast(NewGV, GV->getType());
+ // At this point, the remaining uses of GV should be found only in global
+ // variable initializers, as other uses have already been removed
+ // while walking through the instructions in function definitions.
+ for (Value::use_iterator UI = GV->use_begin(), UE = GV->use_end();
+ UI != UE;) {
+ Use &U = (UI++).getUse();
+ U.set(BitCastNewGV);
+ }
+ std::string Name = GV->getName();
+ GV->removeDeadConstantUsers();
+ GV->eraseFromParent();
+ NewGV->setName(Name);
+ }
+ GVMap.clear();
+
+ return true;
+}
+
+Value *GenericToNVVM::getOrInsertCVTA(Module *M, Function *F,
+ GlobalVariable *GV,
+ IRBuilder<> &Builder) {
+ PointerType *GVType = GV->getType();
+ Value *CVTA = NULL;
+
+ // See if the address space conversion requires the operand to be bitcast
+ // to i8 addrspace(n)* first.
+ EVT ExtendedGVType = EVT::getEVT(GVType->getElementType(), true);
+ if (!ExtendedGVType.isInteger() && !ExtendedGVType.isFloatingPoint()) {
+ // A bitcast to i8 addrspace(n)* on the operand is needed.
+ LLVMContext &Context = M->getContext();
+ unsigned int AddrSpace = GVType->getAddressSpace();
+ Type *DestTy = PointerType::get(Type::getInt8Ty(Context), AddrSpace);
+ CVTA = Builder.CreateBitCast(GV, DestTy, "cvta");
+ // Insert the address space conversion.
+ Type *ResultType =
+ PointerType::get(Type::getInt8Ty(Context), llvm::ADDRESS_SPACE_GENERIC);
+ SmallVector<Type *, 2> ParamTypes;
+ ParamTypes.push_back(ResultType);
+ ParamTypes.push_back(DestTy);
+ Function *CVTAFunction = Intrinsic::getDeclaration(
+ M, Intrinsic::nvvm_ptr_global_to_gen, ParamTypes);
+ CVTA = Builder.CreateCall(CVTAFunction, CVTA, "cvta");
+ // Another bitcast from i8 * to <the element type of GVType> * is
+ // required.
+ DestTy =
+ PointerType::get(GVType->getElementType(), llvm::ADDRESS_SPACE_GENERIC);
+ CVTA = Builder.CreateBitCast(CVTA, DestTy, "cvta");
+ } else {
+ // A simple CVTA is enough.
+ SmallVector<Type *, 2> ParamTypes;
+ ParamTypes.push_back(PointerType::get(GVType->getElementType(),
+ llvm::ADDRESS_SPACE_GENERIC));
+ ParamTypes.push_back(GVType);
+ Function *CVTAFunction = Intrinsic::getDeclaration(
+ M, Intrinsic::nvvm_ptr_global_to_gen, ParamTypes);
+ CVTA = Builder.CreateCall(CVTAFunction, GV, "cvta");
+ }
+
+ return CVTA;
+}
+
+Value *GenericToNVVM::remapConstant(Module *M, Function *F, Constant *C,
+ IRBuilder<> &Builder) {
+ // If the constant C has been converted already in the given function F, just
+ // return the converted value.
+ ConstantToValueMapTy::iterator CTII = ConstantToValueMap.find(C);
+ if (CTII != ConstantToValueMap.end()) {
+ return CTII->second;
+ }
+
+ Value *NewValue = C;
+ if (isa<GlobalVariable>(C)) {
+ // If the constant C is a global variable and is found in GVMap, generate a
+ // set of instructions that convert the clone of C with the global
+ // address space specifier to a generic pointer.
+ // The constant C cannot be used here, as it will be erased from the
+ // module eventually. And the clone of C with the global address space
+ // specifier cannot be used here either, as it will affect the types of
+ // other instructions in the function. Hence, this address space conversion
+ // is required.
+ GVMapTy::iterator I = GVMap.find(cast<GlobalVariable>(C));
+ if (I != GVMap.end()) {
+ NewValue = getOrInsertCVTA(M, F, I->second, Builder);
+ }
+ } else if (isa<ConstantVector>(C) || isa<ConstantArray>(C) ||
+ isa<ConstantStruct>(C)) {
+ // If any element in the constant vector or aggregate C is or uses a global
+ // variable in GVMap, the constant C needs to be reconstructed, using a set
+ // of instructions.
+ NewValue = remapConstantVectorOrConstantAggregate(M, F, C, Builder);
+ } else if (isa<ConstantExpr>(C)) {
+ // If any operand in the constant expression C is or uses a global variable
+ // in GVMap, the constant expression C needs to be reconstructed, using a
+ // set of instructions.
+ NewValue = remapConstantExpr(M, F, cast<ConstantExpr>(C), Builder);
+ }
+
+ ConstantToValueMap[C] = NewValue;
+ return NewValue;
+}
+
+Value *GenericToNVVM::remapConstantVectorOrConstantAggregate(
+ Module *M, Function *F, Constant *C, IRBuilder<> &Builder) {
+ bool OperandChanged = false;
+ SmallVector<Value *, 4> NewOperands;
+ unsigned NumOperands = C->getNumOperands();
+
+ // Check if any element is or uses a global variable in GVMap, and has
+ // therefore been converted to another value.
+ for (unsigned i = 0; i < NumOperands; ++i) {
+ Value *Operand = C->getOperand(i);
+ Value *NewOperand = remapConstant(M, F, cast<Constant>(Operand), Builder);
+ OperandChanged |= Operand != NewOperand;
+ NewOperands.push_back(NewOperand);
+ }
+
+ // If none of the elements has been modified, return C as it is.
+ if (!OperandChanged) {
+ return C;
+ }
+
+ // If any of the elements has been modified, construct the equivalent
+ // vector or aggregate value with a set of instructions and the converted
+ // elements.
+ Value *NewValue = UndefValue::get(C->getType());
+ if (isa<ConstantVector>(C)) {
+ for (unsigned i = 0; i < NumOperands; ++i) {
+ Value *Idx = ConstantInt::get(Type::getInt32Ty(M->getContext()), i);
+ NewValue = Builder.CreateInsertElement(NewValue, NewOperands[i], Idx);
+ }
+ } else {
+ for (unsigned i = 0; i < NumOperands; ++i) {
+ NewValue =
+ Builder.CreateInsertValue(NewValue, NewOperands[i], makeArrayRef(i));
+ }
+ }
+
+ return NewValue;
+}
+
+Value *GenericToNVVM::remapConstantExpr(Module *M, Function *F, ConstantExpr *C,
+ IRBuilder<> &Builder) {
+ bool OperandChanged = false;
+ SmallVector<Value *, 4> NewOperands;
+ unsigned NumOperands = C->getNumOperands();
+
+ // Check if any operand is or uses a global variable in GVMap, and has
+ // therefore been converted to another value.
+ for (unsigned i = 0; i < NumOperands; ++i) {
+ Value *Operand = C->getOperand(i);
+ Value *NewOperand = remapConstant(M, F, cast<Constant>(Operand), Builder);
+ OperandChanged |= Operand != NewOperand;
+ NewOperands.push_back(NewOperand);
+ }
+
+ // If none of the operands has been modified, return C as it is.
+ if (!OperandChanged) {
+ return C;
+ }
+
+ // If any of the operands has been modified, construct the instruction with
+ // the converted operands.
+ unsigned Opcode = C->getOpcode();
+ switch (Opcode) {
+ case Instruction::ICmp:
+ // CompareConstantExpr (icmp)
+ return Builder.CreateICmp(CmpInst::Predicate(C->getPredicate()),
+ NewOperands[0], NewOperands[1]);
+ case Instruction::FCmp:
+ // CompareConstantExpr (fcmp)
+ assert(false && "Address space conversion should have no effect "
+ "on floating point CompareConstantExpr (fcmp)!");
+ return C;
+ case Instruction::ExtractElement:
+ // ExtractElementConstantExpr
+ return Builder.CreateExtractElement(NewOperands[0], NewOperands[1]);
+ case Instruction::InsertElement:
+ // InsertElementConstantExpr
+ return Builder.CreateInsertElement(NewOperands[0], NewOperands[1],
+ NewOperands[2]);
+ case Instruction::ShuffleVector:
+ // ShuffleVector
+ return Builder.CreateShuffleVector(NewOperands[0], NewOperands[1],
+ NewOperands[2]);
+ case Instruction::ExtractValue:
+ // ExtractValueConstantExpr
+ return Builder.CreateExtractValue(NewOperands[0], C->getIndices());
+ case Instruction::InsertValue:
+ // InsertValueConstantExpr
+ return Builder.CreateInsertValue(NewOperands[0], NewOperands[1],
+ C->getIndices());
+ case Instruction::GetElementPtr:
+ // GetElementPtrConstantExpr
+ return cast<GEPOperator>(C)->isInBounds()
+ ? Builder.CreateInBoundsGEP(
+ NewOperands[0],
+ makeArrayRef(&NewOperands[1], NumOperands - 1))
+ : Builder.CreateGEP(
+ NewOperands[0],
+ makeArrayRef(&NewOperands[1], NumOperands - 1));
+ case Instruction::Select:
+ // SelectConstantExpr
+ return Builder.CreateSelect(NewOperands[0], NewOperands[1], NewOperands[2]);
+ default:
+ // BinaryConstantExpr
+ if (Instruction::isBinaryOp(Opcode)) {
+ return Builder.CreateBinOp(Instruction::BinaryOps(C->getOpcode()),
+ NewOperands[0], NewOperands[1]);
+ }
+ // UnaryConstantExpr
+ if (Instruction::isCast(Opcode)) {
+ return Builder.CreateCast(Instruction::CastOps(C->getOpcode()),
+ NewOperands[0], C->getType());
+ }
+ assert(false && "GenericToNVVM encountered an unsupported ConstantExpr");
+ return C;
+ }
+}
+
+void GenericToNVVM::remapNamedMDNode(Module *M, NamedMDNode *N) {
+
+ bool OperandChanged = false;
+ SmallVector<MDNode *, 16> NewOperands;
+ unsigned NumOperands = N->getNumOperands();
+
+ // Check if any operand is or contains a global variable in GVMap, and has
+ // therefore been converted to another value.
+ for (unsigned i = 0; i < NumOperands; ++i) {
+ MDNode *Operand = N->getOperand(i);
+ MDNode *NewOperand = remapMDNode(M, Operand);
+ OperandChanged |= Operand != NewOperand;
+ NewOperands.push_back(NewOperand);
+ }
+
+ // If none of the operands has been modified, return immediately.
+ if (!OperandChanged) {
+ return;
+ }
+
+ // Replace the old operands with the new operands.
+ N->dropAllReferences();
+ for (SmallVector<MDNode *, 16>::iterator I = NewOperands.begin(),
+ E = NewOperands.end();
+ I != E; ++I) {
+ N->addOperand(*I);
+ }
+}
+
+MDNode *GenericToNVVM::remapMDNode(Module *M, MDNode *N) {
+
+ bool OperandChanged = false;
+ SmallVector<Value *, 8> NewOperands;
+ unsigned NumOperands = N->getNumOperands();
+
+ // Check if any operand is or contains a global variable in GVMap, and has
+ // therefore been converted to another value.
+ for (unsigned i = 0; i < NumOperands; ++i) {
+ Value *Operand = N->getOperand(i);
+ Value *NewOperand = Operand;
+ if (Operand) {
+ if (isa<GlobalVariable>(Operand)) {
+ GVMapTy::iterator I = GVMap.find(cast<GlobalVariable>(Operand));
+ if (I != GVMap.end()) {
+ NewOperand = I->second;
+ if (++i < NumOperands) {
+ NewOperands.push_back(NewOperand);
+ // Address space of the global variable follows the global variable
+ // in the global variable debug info (see createGlobalVariable in
+ // lib/Analysis/DIBuilder.cpp).
+ NewOperand =
+ ConstantInt::get(Type::getInt32Ty(M->getContext()),
+ I->second->getType()->getAddressSpace());
+ }
+ }
+ } else if (isa<MDNode>(Operand)) {
+ NewOperand = remapMDNode(M, cast<MDNode>(Operand));
+ }
+ }
+ OperandChanged |= Operand != NewOperand;
+ NewOperands.push_back(NewOperand);
+ }
+
+ // If none of the operands has been modified, return N as it is.
+ if (!OperandChanged) {
+ return N;
+ }
+
+ // If any of the operands has been modified, create a new MDNode with the new
+ // operands.
+ return MDNode::get(M->getContext(), makeArrayRef(NewOperands));
+}
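In-tree, the new pass is scheduled by NVPTXPassConfig::addIRPasses (see NVPTXTargetMachine.cpp further down). A minimal sketch of driving it by hand, assuming createGenericToNVVMPass() is declared in NVPTX.h and using the legacy PassManager of this release:

    #include "NVPTX.h"
    #include "llvm/IR/Module.h"
    #include "llvm/PassManager.h"

    // Rewrites every default-address-space global into a global-address-space
    // clone and patches instructions, initializers and debug metadata to match.
    static void runGenericToNVVM(llvm::Module &M) {
      llvm::PassManager PM;
      PM.add(llvm::createGenericToNVVMPass());
      PM.run(M);
    }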
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 0f4c8dbce5..d4378c2322 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -42,6 +42,11 @@ static cl::opt<int> UsePrecDivF32(
" IEEE Compliant F32 div.rnd if avaiable."),
cl::init(2));
+static cl::opt<bool>
+UsePrecSqrtF32("nvptx-prec-sqrtf32",
+ cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
+ cl::init(true));
+
/// createNVPTXISelDag - This pass converts a legalized DAG into a
/// NVPTX-specific DAG, ready for instruction scheduling.
FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM,
@@ -74,6 +79,8 @@ NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm,
// Decide how to translate f32 div
do_DIVF32_PREC = UsePrecDivF32;
+ // Decide how to translate f32 sqrt
+ do_SQRTF32_PREC = UsePrecSqrtF32;
// sm less than sm_20 does not support div.rnd. Use div.full.
if (do_DIVF32_PREC == 2 && !Subtarget.reqPTX20())
do_DIVF32_PREC = 1;
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 70e8e46429..ed16d4450b 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -41,6 +41,10 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
// Otherwise, use div.full
int do_DIVF32_PREC;
+ // If true, generate sqrt.rn, else generate sqrt.approx. If FTZ
+ // is true, then generate the corresponding FTZ version.
+ bool do_SQRTF32_PREC;
+
// If true, add .ftz to f32 instructions.
// This is only meaningful for sm_20 and later, as the default
// is not ftz.
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
index f43abe283b..da6dd39b93 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -75,6 +75,9 @@ def allowFMA_ftz : Predicate<"(allowFMA && UseF32FTZ)">;
def do_DIVF32_APPROX : Predicate<"do_DIVF32_PREC==0">;
def do_DIVF32_FULL : Predicate<"do_DIVF32_PREC==1">;
+def do_SQRTF32_APPROX : Predicate<"do_SQRTF32_PREC==0">;
+def do_SQRTF32_RN : Predicate<"do_SQRTF32_PREC==1">;
+
def hasHWROT32 : Predicate<"Subtarget.hasHWROT32()">;
def true : Predicate<"1">;
diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td
index 49e2568dfa..24037cafef 100644
--- a/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -512,6 +512,16 @@ def INT_NVVM_SQRT_RM_D : F_MATH_1<"sqrt.rm.f64 \t$dst, $src0;", Float64Regs,
def INT_NVVM_SQRT_RP_D : F_MATH_1<"sqrt.rp.f64 \t$dst, $src0;", Float64Regs,
Float64Regs, int_nvvm_sqrt_rp_d>;
+// nvvm_sqrt intrinsic
+def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
+ (INT_NVVM_SQRT_RN_FTZ_F Float32Regs:$a)>, Requires<[doF32FTZ, do_SQRTF32_RN]>;
+def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
+ (INT_NVVM_SQRT_RN_F Float32Regs:$a)>, Requires<[do_SQRTF32_RN]>;
+def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
+ (INT_NVVM_SQRT_APPROX_FTZ_F Float32Regs:$a)>, Requires<[doF32FTZ]>;
+def : Pat<(int_nvvm_sqrt_f Float32Regs:$a),
+ (INT_NVVM_SQRT_APPROX_F Float32Regs:$a)>;
+
//
// Rsqrt
//
@@ -1510,38 +1520,12 @@ multiclass G_TO_NG<string Str, Intrinsic Intrin> {
defm cvta_local : NG_TO_G<"local", int_nvvm_ptr_local_to_gen>;
defm cvta_shared : NG_TO_G<"shared", int_nvvm_ptr_shared_to_gen>;
defm cvta_global : NG_TO_G<"global", int_nvvm_ptr_global_to_gen>;
+defm cvta_const : NG_TO_G<"const", int_nvvm_ptr_constant_to_gen>;
defm cvta_to_local : G_TO_NG<"local", int_nvvm_ptr_gen_to_local>;
defm cvta_to_shared : G_TO_NG<"shared", int_nvvm_ptr_gen_to_shared>;
defm cvta_to_global : G_TO_NG<"global", int_nvvm_ptr_gen_to_global>;
-
-def cvta_const : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
- "mov.u32 \t$result, $src;",
- [(set Int32Regs:$result, (int_nvvm_ptr_constant_to_gen Int32Regs:$src))]>;
-def cvta_const_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
- "mov.u64 \t$result, $src;",
- [(set Int64Regs:$result, (int_nvvm_ptr_constant_to_gen Int64Regs:$src))]>;
-
-
-
-// @TODO: Revisit this. There is a type
-// contradiction between iPTRAny and iPTR for the def.
-/*def cvta_const_addr : NVPTXInst<(outs Int32Regs:$result), (ins imemAny:$src),
- "mov.u32 \t$result, $src;",
- [(set Int32Regs:$result, (int_nvvm_ptr_constant_to_gen
- (Wrapper tglobaladdr:$src)))]>;
-def cvta_const_addr_64 : NVPTXInst<(outs Int64Regs:$result), (ins imemAny:$src),
- "mov.u64 \t$result, $src;",
- [(set Int64Regs:$result, (int_nvvm_ptr_constant_to_gen
- (Wrapper tglobaladdr:$src)))]>;*/
-
-
-def cvta_to_const : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src),
- "mov.u32 \t$result, $src;",
- [(set Int32Regs:$result, (int_nvvm_ptr_gen_to_constant Int32Regs:$src))]>;
-def cvta_to_const_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src),
- "mov.u64 \t$result, $src;",
- [(set Int64Regs:$result, (int_nvvm_ptr_gen_to_constant Int64Regs:$src))]>;
+defm cvta_to_const : G_TO_NG<"const", int_nvvm_ptr_gen_to_constant>;
// nvvm.ptr.gen.to.param
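The four int_nvvm_sqrt_f patterns added near the top of this file are tried in order, so the effective choice reduces to the sketch below (plain C++, assuming the usual PTX mnemonics behind the INT_NVVM_SQRT_* instruction definitions):

    // Mirrors the pattern priority for int_nvvm_sqrt_f: precise sqrt.rn wins
    // over sqrt.approx, and the .ftz form is used when f32 denormals flush.
    static const char *selectSqrtF32(bool UseF32FTZ, bool PrecSqrtF32) {
      if (PrecSqrtF32)
        return UseF32FTZ ? "sqrt.rn.ftz.f32" : "sqrt.rn.f32";
      return UseF32FTZ ? "sqrt.approx.ftz.f32" : "sqrt.approx.f32";
    }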
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 67ca6b58e5..1ae2a7cc86 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -49,6 +49,7 @@ using namespace llvm;
namespace llvm {
void initializeNVVMReflectPass(PassRegistry&);
+void initializeGenericToNVVMPass(PassRegistry&);
}
extern "C" void LLVMInitializeNVPTXTarget() {
@@ -62,6 +63,7 @@ extern "C" void LLVMInitializeNVPTXTarget() {
// FIXME: This pass is really intended to be invoked during IR optimization,
// but it's very NVPTX-specific.
initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
+ initializeGenericToNVVMPass(*PassRegistry::getPassRegistry());
}
NVPTXTargetMachine::NVPTXTargetMachine(
@@ -100,6 +102,7 @@ public:
return getTM<NVPTXTargetMachine>();
}
+ virtual void addIRPasses();
virtual bool addInstSelector();
virtual bool addPreRegAlloc();
};
@@ -110,6 +113,11 @@ TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) {
return PassConfig;
}
+void NVPTXPassConfig::addIRPasses() {
+ TargetPassConfig::addIRPasses();
+ addPass(createGenericToNVVMPass());
+}
+
bool NVPTXPassConfig::addInstSelector() {
addPass(createLowerAggrCopies());
addPass(createSplitBBatBarPass());
diff --git a/lib/Target/NVPTX/NVVMReflect.cpp b/lib/Target/NVPTX/NVVMReflect.cpp
index 0ad62ce39b..3cc324b85e 100644
--- a/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/lib/Target/NVPTX/NVVMReflect.cpp
@@ -14,6 +14,7 @@
//
//===----------------------------------------------------------------------===//
+#include "NVPTX.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
@@ -40,7 +41,7 @@ using namespace llvm;
namespace llvm { void initializeNVVMReflectPass(PassRegistry &); }
namespace {
-class LLVM_LIBRARY_VISIBILITY NVVMReflect : public ModulePass {
+class NVVMReflect : public ModulePass {
private:
StringMap<int> VarMap;
typedef DenseMap<std::string, int>::iterator VarMapIter;
@@ -48,9 +49,18 @@ private:
public:
static char ID;
- NVVMReflect() : ModulePass(ID) {
+ NVVMReflect() : ModulePass(ID), ReflectFunction(0) {
+ initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
VarMap.clear();
- ReflectFunction = 0;
+ }
+
+ NVVMReflect(const StringMap<int> &Mapping)
+ : ModulePass(ID), ReflectFunction(0) {
+ initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
+ for (StringMap<int>::const_iterator I = Mapping.begin(), E = Mapping.end();
+ I != E; ++I) {
+ VarMap[(*I).getKey()] = (*I).getValue();
+ }
}
void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); }
@@ -60,6 +70,14 @@ public:
};
}
+ModulePass *llvm::createNVVMReflectPass() {
+ return new NVVMReflect();
+}
+
+ModulePass *llvm::createNVVMReflectPass(const StringMap<int>& Mapping) {
+ return new NVVMReflect(Mapping);
+}
+
static cl::opt<bool>
NVVMReflectEnabled("nvvm-reflect-enable", cl::init(true),
cl::desc("NVVM reflection, enabled by default"));
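Besides the command-line switch, the new StringMap constructor lets a client seed reflection answers programmatically. A sketch, assuming both createNVVMReflectPass overloads are declared in NVPTX.h; the key below is a made-up reflect variable name:

    #include "NVPTX.h"
    #include "llvm/ADT/StringMap.h"
    #include "llvm/PassManager.h"

    // Every __nvvm_reflect("__example_flag") call in the module is folded to 1.
    static void addSeededNVVMReflect(llvm::PassManager &PM) {
      llvm::StringMap<int> ReflectVals;
      ReflectVals["__example_flag"] = 1;
      PM.add(llvm::createNVVMReflectPass(ReflectVals));
    }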
diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp
index bd1c378681..3e608ca8f6 100644
--- a/lib/Target/PowerPC/PPCBranchSelector.cpp
+++ b/lib/Target/PowerPC/PPCBranchSelector.cpp
@@ -112,15 +112,21 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
unsigned MBBStartOffset = 0;
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E; ++I) {
- if (I->getOpcode() != PPC::BCC || I->getOperand(2).isImm()) {
+ MachineBasicBlock *Dest = 0;
+ if (I->getOpcode() == PPC::BCC && !I->getOperand(2).isImm())
+ Dest = I->getOperand(2).getMBB();
+ else if ((I->getOpcode() == PPC::BDNZ8 || I->getOpcode() == PPC::BDNZ ||
+ I->getOpcode() == PPC::BDZ8 || I->getOpcode() == PPC::BDZ) &&
+ !I->getOperand(0).isImm())
+ Dest = I->getOperand(0).getMBB();
+
+ if (!Dest) {
MBBStartOffset += TII->GetInstSizeInBytes(I);
continue;
}
// Determine the offset from the current branch to the destination
// block.
- MachineBasicBlock *Dest = I->getOperand(2).getMBB();
-
int BranchSize;
if (Dest->getNumber() <= MBB.getNumber()) {
// If this is a backwards branch, the delta is the offset from the
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index b44d2482d5..d2620b2877 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -284,6 +284,17 @@ class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : Pat <
(BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y)
>;
+// Bitfield extract patterns
+
+def legalshift32 : ImmLeaf <i32, [{return Imm >=0 && Imm < 32;}]>;
+def bfemask : PatLeaf <(imm), [{return isMask_32(N->getZExtValue());}],
+ SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(CountTrailingOnes_32(N->getZExtValue()), MVT::i32);}]>>;
+
+class BFEPattern <Instruction BFE> : Pat <
+ (and (srl i32:$x, legalshift32:$y), bfemask:$z),
+ (BFE $x, $y, $z)
+>;
+
include "R600Instructions.td"
include "SIInstrInfo.td"
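The bfemask leaf above only matches constants that are contiguous low-bit masks and rewrites them into a bit count before feeding BFE. The same operand transform in plain C++, using the MathExtras helpers this patch already relies on:

    #include "llvm/Support/MathExtras.h"
    #include <cassert>
    #include <stdint.h>

    // 0x3f -> 6, 0xff -> 8: the AND mask becomes the BFE width operand.
    static unsigned bfeWidthFromMask(uint32_t Mask) {
      assert(llvm::isMask_32(Mask) && "bfemask only matches low-bit masks");
      return llvm::CountTrailingOnes_32(Mask);
    }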
diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
index 0ec67ce13b..31fbf32d0c 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
@@ -115,7 +115,6 @@ AMDGPUPassConfig::addPreISel() {
}
bool AMDGPUPassConfig::addInstSelector() {
- addPass(createAMDGPUPeepholeOpt(*TM));
addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
diff --git a/lib/Target/R600/AMDILDeviceInfo.cpp b/lib/Target/R600/AMDILDeviceInfo.cpp
index 178795936a..126514b976 100644
--- a/lib/Target/R600/AMDILDeviceInfo.cpp
+++ b/lib/Target/R600/AMDILDeviceInfo.cpp
@@ -81,7 +81,8 @@ AMDGPUDevice* getDeviceFromName(const std::string &deviceName,
return new AMDGPUNIDevice(ptr);
} else if (deviceName == "SI" ||
deviceName == "tahiti" || deviceName == "pitcairn" ||
- deviceName == "verde" || deviceName == "oland") {
+ deviceName == "verde" || deviceName == "oland" ||
+ deviceName == "hainan") {
return new AMDGPUSIDevice(ptr);
} else {
#if DEBUG
diff --git a/lib/Target/R600/AMDILPeepholeOptimizer.cpp b/lib/Target/R600/AMDILPeepholeOptimizer.cpp
deleted file mode 100644
index 3a28038666..0000000000
--- a/lib/Target/R600/AMDILPeepholeOptimizer.cpp
+++ /dev/null
@@ -1,1215 +0,0 @@
-//===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations ---------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-/// \file
-//==-----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "PeepholeOpt"
-#ifdef DEBUG
-#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
-#else
-#define DEBUGME 0
-#endif
-
-#include "AMDILDevices.h"
-#include "AMDGPUInstrInfo.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFunctionAnalysis.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/MathExtras.h"
-
-#include <sstream>
-
-#if 0
-STATISTIC(PointerAssignments, "Number of dynamic pointer "
- "assigments discovered");
-STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
-#endif
-
-using namespace llvm;
-// The Peephole optimization pass is used to do simple last minute optimizations
-// that are required for correct code or to remove redundant functions
-namespace {
-
-class OpaqueType;
-
-class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
-public:
- TargetMachine &TM;
- static char ID;
- AMDGPUPeepholeOpt(TargetMachine &tm);
- ~AMDGPUPeepholeOpt();
- const char *getPassName() const;
- bool runOnFunction(Function &F);
- bool doInitialization(Module &M);
- bool doFinalization(Module &M);
- void getAnalysisUsage(AnalysisUsage &AU) const;
-protected:
-private:
- // Function to initiate all of the instruction level optimizations.
- bool instLevelOptimizations(BasicBlock::iterator *inst);
- // Quick check to see if we need to dump all of the pointers into the
- // arena. If this is correct, then we set all pointers to exist in arena. This
- // is a workaround for aliasing of pointers in a struct/union.
- bool dumpAllIntoArena(Function &F);
- // Because I don't want to invalidate any pointers while in the
- // safeNestedForEachFunction. I push atomic conversions to a vector and handle
- // it later. This function does the conversions if required.
- void doAtomicConversionIfNeeded(Function &F);
- // Because __amdil_is_constant cannot be properly evaluated if
- // optimizations are disabled, the call's are placed in a vector
- // and evaluated after the __amdil_image* functions are evaluated
- // which should allow the __amdil_is_constant function to be
- // evaluated correctly.
- void doIsConstCallConversionIfNeeded();
- bool mChanged;
- bool mDebug;
- bool mConvertAtomics;
- CodeGenOpt::Level optLevel;
- // Run a series of tests to see if we can optimize a CALL instruction.
- bool optimizeCallInst(BasicBlock::iterator *bbb);
- // A peephole optimization to optimize bit extract sequences.
- bool optimizeBitExtract(Instruction *inst);
- // A peephole optimization to optimize bit insert sequences.
- bool optimizeBitInsert(Instruction *inst);
- bool setupBitInsert(Instruction *base,
- Instruction *&src,
- Constant *&mask,
- Constant *&shift);
- // Expand the bit field insert instruction on versions of OpenCL that
- // don't support it.
- bool expandBFI(CallInst *CI);
- // Expand the bit field mask instruction on version of OpenCL that
- // don't support it.
- bool expandBFM(CallInst *CI);
- // On 7XX and 8XX operations, we do not have 24 bit signed operations. So in
- // this case we need to expand them. These functions check for 24bit functions
- // and then expand.
- bool isSigned24BitOps(CallInst *CI);
- void expandSigned24BitOps(CallInst *CI);
- // One optimization that can occur is that if the required workgroup size is
- // specified then the result of get_local_size is known at compile time and
- // can be returned accordingly.
- bool isRWGLocalOpt(CallInst *CI);
- // On northern island cards, the division is slightly less accurate than on
- // previous generations, so we need to utilize a more accurate division. So we
- // can translate the accurate divide to a normal divide on all other cards.
- bool convertAccurateDivide(CallInst *CI);
- void expandAccurateDivide(CallInst *CI);
- // If the alignment is set incorrectly, it can produce really inefficient
- // code. This checks for this scenario and fixes it if possible.
- bool correctMisalignedMemOp(Instruction *inst);
-
- // If we are in no opt mode, then we need to make sure that
- // local samplers are properly propagated as constant propagation
- // doesn't occur and we need to know the value of kernel defined
- // samplers at compile time.
- bool propagateSamplerInst(CallInst *CI);
-
- // Helper functions
-
- // Group of functions that recursively calculate the size of a structure based
- // on it's sub-types.
- size_t getTypeSize(Type * const T, bool dereferencePtr = false);
- size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
- size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
- size_t getTypeSize(FunctionType * const FT,bool dereferencePtr = false);
- size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
- size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
- size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
- size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);
-
- LLVMContext *mCTX;
- Function *mF;
- const AMDGPUSubtarget *mSTM;
- SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
- SmallVector<CallInst *, 16> isConstVec;
-}; // class AMDGPUPeepholeOpt
- char AMDGPUPeepholeOpt::ID = 0;
-
-// A template function that has two levels of looping before calling the
-// function with a pointer to the current iterator.
-template<class InputIterator, class SecondIterator, class Function>
-Function safeNestedForEach(InputIterator First, InputIterator Last,
- SecondIterator S, Function F) {
- for ( ; First != Last; ++First) {
- SecondIterator sf, sl;
- for (sf = First->begin(), sl = First->end();
- sf != sl; ) {
- if (!F(&sf)) {
- ++sf;
- }
- }
- }
- return F;
-}
-
-} // anonymous namespace
-
-namespace llvm {
- FunctionPass *
- createAMDGPUPeepholeOpt(TargetMachine &tm) {
- return new AMDGPUPeepholeOpt(tm);
- }
-} // llvm namespace
-
-AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm)
- : FunctionPass(ID), TM(tm) {
- mDebug = DEBUGME;
- optLevel = TM.getOptLevel();
-
-}
-
-AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt() {
-}
-
-const char *
-AMDGPUPeepholeOpt::getPassName() const {
- return "AMDGPU PeepHole Optimization Pass";
-}
-
-bool
-containsPointerType(Type *Ty) {
- if (!Ty) {
- return false;
- }
- switch(Ty->getTypeID()) {
- default:
- return false;
- case Type::StructTyID: {
- const StructType *ST = dyn_cast<StructType>(Ty);
- for (StructType::element_iterator stb = ST->element_begin(),
- ste = ST->element_end(); stb != ste; ++stb) {
- if (!containsPointerType(*stb)) {
- continue;
- }
- return true;
- }
- break;
- }
- case Type::VectorTyID:
- case Type::ArrayTyID:
- return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType());
- case Type::PointerTyID:
- return true;
- };
- return false;
-}
-
-bool
-AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F) {
- bool dumpAll = false;
- for (Function::const_arg_iterator cab = F.arg_begin(),
- cae = F.arg_end(); cab != cae; ++cab) {
- const Argument *arg = cab;
- const PointerType *PT = dyn_cast<PointerType>(arg->getType());
- if (!PT) {
- continue;
- }
- Type *DereferencedType = PT->getElementType();
- if (!dyn_cast<StructType>(DereferencedType)
- ) {
- continue;
- }
- if (!containsPointerType(DereferencedType)) {
- continue;
- }
- // FIXME: Because a pointer inside of a struct/union may be aliased to
- // another pointer we need to take the conservative approach and place all
- // pointers into the arena until more advanced detection is implemented.
- dumpAll = true;
- }
- return dumpAll;
-}
-void
-AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() {
- if (isConstVec.empty()) {
- return;
- }
- for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) {
- CallInst *CI = isConstVec[x];
- Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
- Type *aType = Type::getInt32Ty(*mCTX);
- Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
- : ConstantInt::get(aType, 0);
- CI->replaceAllUsesWith(Val);
- CI->eraseFromParent();
- }
- isConstVec.clear();
-}
-void
-AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F) {
- // Don't do anything if we don't have any atomic operations.
- if (atomicFuncs.empty()) {
- return;
- }
- // Change the function name for the atomic if it is required
- uint32_t size = atomicFuncs.size();
- for (uint32_t x = 0; x < size; ++x) {
- atomicFuncs[x].first->setOperand(
- atomicFuncs[x].first->getNumOperands()-1,
- atomicFuncs[x].second);
-
- }
- mChanged = true;
- if (mConvertAtomics) {
- return;
- }
-}
-
-bool
-AMDGPUPeepholeOpt::runOnFunction(Function &MF) {
- mChanged = false;
- mF = &MF;
- mSTM = &TM.getSubtarget<AMDGPUSubtarget>();
- if (mDebug) {
- MF.dump();
- }
- mCTX = &MF.getType()->getContext();
- mConvertAtomics = true;
- safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
- std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
- this));
-
- doAtomicConversionIfNeeded(MF);
- doIsConstCallConversionIfNeeded();
-
- if (mDebug) {
- MF.dump();
- }
- return mChanged;
-}
-
-bool
-AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) {
- Instruction *inst = (*bbb);
- CallInst *CI = dyn_cast<CallInst>(inst);
- if (!CI) {
- return false;
- }
- if (isSigned24BitOps(CI)) {
- expandSigned24BitOps(CI);
- ++(*bbb);
- CI->eraseFromParent();
- return true;
- }
- if (propagateSamplerInst(CI)) {
- return false;
- }
- if (expandBFI(CI) || expandBFM(CI)) {
- ++(*bbb);
- CI->eraseFromParent();
- return true;
- }
- if (convertAccurateDivide(CI)) {
- expandAccurateDivide(CI);
- ++(*bbb);
- CI->eraseFromParent();
- return true;
- }
-
- StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName();
- if (calleeName.startswith("__amdil_is_constant")) {
- // If we do not have optimizations, then this
- // cannot be properly evaluated, so we add the
- // call instruction to a vector and process
- // them at the end of processing after the
- // samplers have been correctly handled.
- if (optLevel == CodeGenOpt::None) {
- isConstVec.push_back(CI);
- return false;
- } else {
- Constant *CV = dyn_cast<Constant>(CI->getOperand(0));
- Type *aType = Type::getInt32Ty(*mCTX);
- Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1)
- : ConstantInt::get(aType, 0);
- CI->replaceAllUsesWith(Val);
- ++(*bbb);
- CI->eraseFromParent();
- return true;
- }
- }
-
- if (calleeName.equals("__amdil_is_asic_id_i32")) {
- ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
- Type *aType = Type::getInt32Ty(*mCTX);
- Value *Val = CV;
- if (Val) {
- Val = ConstantInt::get(aType,
- mSTM->device()->getDeviceFlag() & CV->getZExtValue());
- } else {
- Val = ConstantInt::get(aType, 0);
- }
- CI->replaceAllUsesWith(Val);
- ++(*bbb);
- CI->eraseFromParent();
- return true;
- }
- Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
- if (!F) {
- return false;
- }
- if (F->getName().startswith("__atom") && !CI->getNumUses()
- && F->getName().find("_xchg") == StringRef::npos) {
- std::string buffer(F->getName().str() + "_noret");
- F = dyn_cast<Function>(
- F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
- atomicFuncs.push_back(std::make_pair(CI, F));
- }
-
- if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
- && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
- return false;
- }
- if (!mConvertAtomics) {
- return false;
- }
- StringRef name = F->getName();
- if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
- mConvertAtomics = false;
- }
- return false;
-}
-
-bool
-AMDGPUPeepholeOpt::setupBitInsert(Instruction *base,
- Instruction *&src,
- Constant *&mask,
- Constant *&shift) {
- if (!base) {
- if (mDebug) {
- dbgs() << "Null pointer passed into function.\n";
- }
- return false;
- }
- bool andOp = false;
- if (base->getOpcode() == Instruction::Shl) {
- shift = dyn_cast<Constant>(base->getOperand(1));
- } else if (base->getOpcode() == Instruction::And) {
- mask = dyn_cast<Constant>(base->getOperand(1));
- andOp = true;
- } else {
- if (mDebug) {
- dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
- }
- // If the base is neither a Shl or a And, we don't fit any of the patterns above.
- return false;
- }
- src = dyn_cast<Instruction>(base->getOperand(0));
- if (!src) {
- if (mDebug) {
- dbgs() << "Failed setup since the base operand is not an instruction!\n";
- }
- return false;
- }
- // If we find an 'and' operation, then we don't need to
- // find the next operation as we already know the
- // bits that are valid at this point.
- if (andOp) {
- return true;
- }
- if (src->getOpcode() == Instruction::Shl && !shift) {
- shift = dyn_cast<Constant>(src->getOperand(1));
- src = dyn_cast<Instruction>(src->getOperand(0));
- } else if (src->getOpcode() == Instruction::And && !mask) {
- mask = dyn_cast<Constant>(src->getOperand(1));
- }
- if (!mask && !shift) {
- if (mDebug) {
- dbgs() << "Failed setup since both mask and shift are NULL!\n";
- }
- // Did not find a constant mask or a shift.
- return false;
- }
- return true;
-}
-bool
-AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst) {
- if (!inst) {
- return false;
- }
- if (!inst->isBinaryOp()) {
- return false;
- }
- if (inst->getOpcode() != Instruction::Or) {
- return false;
- }
- if (optLevel == CodeGenOpt::None) {
- return false;
- }
- // We want to do an optimization on a sequence of ops that in the end equals a
- // single ISA instruction.
- // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
- // Some simplified versions of this pattern are as follows:
- // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
- // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
- // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
- // (A & B) | (D << F) when (1 << F) >= B
- // (A << C) | (D & E) when (1 << C) >= E
- if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
- // The HD4XXX hardware doesn't support the ubit_insert instruction.
- return false;
- }
- Type *aType = inst->getType();
- bool isVector = aType->isVectorTy();
- int numEle = 1;
- // This optimization only works on 32bit integers.
- if (aType->getScalarType()
- != Type::getInt32Ty(inst->getContext())) {
- return false;
- }
- if (isVector) {
- const VectorType *VT = dyn_cast<VectorType>(aType);
- numEle = VT->getNumElements();
- // We currently cannot support more than 4 elements in a intrinsic and we
- // cannot support Vec3 types.
- if (numEle > 4 || numEle == 3) {
- return false;
- }
- }
- // TODO: Handle vectors.
- if (isVector) {
- if (mDebug) {
- dbgs() << "!!! Vectors are not supported yet!\n";
- }
- return false;
- }
- Instruction *LHSSrc = NULL, *RHSSrc = NULL;
- Constant *LHSMask = NULL, *RHSMask = NULL;
- Constant *LHSShift = NULL, *RHSShift = NULL;
- Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
- Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
- if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
- if (mDebug) {
- dbgs() << "Found an OR Operation that failed setup!\n";
- inst->dump();
- if (LHS) { LHS->dump(); }
- if (LHSSrc) { LHSSrc->dump(); }
- if (LHSMask) { LHSMask->dump(); }
- if (LHSShift) { LHSShift->dump(); }
- }
- // There was an issue with the setup for BitInsert.
- return false;
- }
- if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
- if (mDebug) {
- dbgs() << "Found an OR Operation that failed setup!\n";
- inst->dump();
- if (RHS) { RHS->dump(); }
- if (RHSSrc) { RHSSrc->dump(); }
- if (RHSMask) { RHSMask->dump(); }
- if (RHSShift) { RHSShift->dump(); }
- }
- // There was an issue with the setup for BitInsert.
- return false;
- }
- if (mDebug) {
- dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n";
- dbgs() << "Op: "; inst->dump();
- dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
- dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
- dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
- dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
- dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
- dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
- dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
- dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
- }
- Constant *offset = NULL;
- Constant *width = NULL;
- uint32_t lhsMaskVal = 0, rhsMaskVal = 0;
- uint32_t lhsShiftVal = 0, rhsShiftVal = 0;
- uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
- uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
- lhsMaskVal = (LHSMask
- ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
- rhsMaskVal = (RHSMask
- ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
- lhsShiftVal = (LHSShift
- ? dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0);
- rhsShiftVal = (RHSShift
- ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0);
- lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal;
- rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal;
- lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal;
- rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal;
- // TODO: Handle the case of A & B | D & ~B(i.e. inverted masks).
- if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) {
- return false;
- }
- if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) {
- offset = ConstantInt::get(aType, lhsMaskOffset, false);
- width = ConstantInt::get(aType, lhsMaskWidth, false);
- RHSSrc = RHS;
- if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) {
- return false;
- }
- if (!LHSShift) {
- LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
- "MaskShr", LHS);
- } else if (lhsShiftVal != lhsMaskOffset) {
- LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
- "MaskShr", LHS);
- }
- if (mDebug) {
- dbgs() << "Optimizing LHS!\n";
- }
- } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) {
- offset = ConstantInt::get(aType, rhsMaskOffset, false);
- width = ConstantInt::get(aType, rhsMaskWidth, false);
- LHSSrc = RHSSrc;
- RHSSrc = LHS;
- if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) {
- return false;
- }
- if (!RHSShift) {
- LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
- "MaskShr", RHS);
- } else if (rhsShiftVal != rhsMaskOffset) {
- LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset,
- "MaskShr", RHS);
- }
- if (mDebug) {
- dbgs() << "Optimizing RHS!\n";
- }
- } else {
- if (mDebug) {
- dbgs() << "Failed constraint 3!\n";
- }
- return false;
- }
- if (mDebug) {
- dbgs() << "Width: "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; }
- dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; }
- dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; }
- dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; }
- }
- if (!offset || !width) {
- if (mDebug) {
- dbgs() << "Either width or offset are NULL, failed detection!\n";
- }
- return false;
- }
- // Lets create the function signature.
- std::vector<Type *> callTypes;
- callTypes.push_back(aType);
- callTypes.push_back(aType);
- callTypes.push_back(aType);
- callTypes.push_back(aType);
- FunctionType *funcType = FunctionType::get(aType, callTypes, false);
- std::string name = "__amdil_ubit_insert";
- if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
- Function *Func =
- dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
- getOrInsertFunction(StringRef(name), funcType));
- Value *Operands[4] = {
- width,
- offset,
- LHSSrc,
- RHSSrc
- };
- CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
- if (mDebug) {
- dbgs() << "Old Inst: ";
- inst->dump();
- dbgs() << "New Inst: ";
- CI->dump();
- dbgs() << "\n\n";
- }
- CI->insertBefore(inst);
- inst->replaceAllUsesWith(CI);
- return true;
-}
-
-bool
-AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst) {
- if (!inst) {
- return false;
- }
- if (!inst->isBinaryOp()) {
- return false;
- }
- if (inst->getOpcode() != Instruction::And) {
- return false;
- }
- if (optLevel == CodeGenOpt::None) {
- return false;
- }
- // We want to do some simple optimizations on Shift right/And patterns. The
- // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a
- // value smaller than 32 and C is a mask. If C is a constant value, then the
- // following transformation can occur. For signed integers, it turns into the
- // function call dst = __amdil_ibit_extract(log2(C), B, A) For unsigned
- // integers, it turns into the function call dst =
- // __amdil_ubit_extract(log2(C), B, A) The function __amdil_[u|i]bit_extract
- // can be found in Section 7.9 of the ATI IL spec of the stream SDK for
- // Evergreen hardware.
- if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
- // This does not work on HD4XXX hardware.
- return false;
- }
- Type *aType = inst->getType();
- bool isVector = aType->isVectorTy();
-
- // XXX Support vector types
- if (isVector) {
- return false;
- }
- int numEle = 1;
- // This only works on 32bit integers
- if (aType->getScalarType()
- != Type::getInt32Ty(inst->getContext())) {
- return false;
- }
- if (isVector) {
- const VectorType *VT = dyn_cast<VectorType>(aType);
- numEle = VT->getNumElements();
- // We currently cannot support more than 4 elements in a intrinsic and we
- // cannot support Vec3 types.
- if (numEle > 4 || numEle == 3) {
- return false;
- }
- }
- BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
- // If the first operand is not a shift instruction, then we can return as it
- // doesn't match this pattern.
- if (!ShiftInst || !ShiftInst->isShift()) {
- return false;
- }
- // If we are a shift left, then we need don't match this pattern.
- if (ShiftInst->getOpcode() == Instruction::Shl) {
- return false;
- }
- bool isSigned = ShiftInst->isArithmeticShift();
- Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
- Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
- // Lets make sure that the shift value and the and mask are constant integers.
- if (!AndMask || !ShrVal) {
- return false;
- }
- Constant *newMaskConst;
- Constant *shiftValConst;
- if (isVector) {
- // Handle the vector case
- std::vector<Constant *> maskVals;
- std::vector<Constant *> shiftVals;
- ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
- ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
- Type *scalarType = AndMaskVec->getType()->getScalarType();
- assert(AndMaskVec->getNumOperands() ==
- ShrValVec->getNumOperands() && "cannot have a "
- "combination where the number of elements to a "
- "shift and an and are different!");
- for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
- ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
- ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
- if (!AndCI || !ShiftIC) {
- return false;
- }
- uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
- if (!isMask_32(maskVal)) {
- return false;
- }
- maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
- uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
- // If the mask or shiftval is greater than the bitcount, then break out.
- if (maskVal >= 32 || shiftVal >= 32) {
- return false;
- }
- // If the mask val is greater than the the number of original bits left
- // then this optimization is invalid.
- if (maskVal > (32 - shiftVal)) {
- return false;
- }
- maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
- shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
- }
- newMaskConst = ConstantVector::get(maskVals);
- shiftValConst = ConstantVector::get(shiftVals);
- } else {
- // Handle the scalar case
- uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
- // This must be a mask value where all lower bits are set to 1 and then any
- // bit higher is set to 0.
- if (!isMask_32(maskVal)) {
- return false;
- }
- maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
- // Count the number of bits set in the mask, this is the width of the
- // resulting bit set that is extracted from the source value.
- uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
- // If the mask or shift val is greater than the bitcount, then break out.
- if (maskVal >= 32 || shiftVal >= 32) {
- return false;
- }
-    // If the mask val is greater than the number of original bits left then
- // this optimization is invalid.
- if (maskVal > (32 - shiftVal)) {
- return false;
- }
- newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
- shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
- }
- // Lets create the function signature.
- std::vector<Type *> callTypes;
- callTypes.push_back(aType);
- callTypes.push_back(aType);
- callTypes.push_back(aType);
- FunctionType *funcType = FunctionType::get(aType, callTypes, false);
- std::string name = "llvm.AMDGPU.bit.extract.u32";
- if (isVector) {
- name += ".v" + itostr(numEle) + "i32";
- } else {
- name += ".";
- }
- // Lets create the function.
- Function *Func =
- dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
- getOrInsertFunction(StringRef(name), funcType));
- Value *Operands[3] = {
- ShiftInst->getOperand(0),
- shiftValConst,
- newMaskConst
- };
- // Lets create the Call with the operands
- CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt");
- CI->setDoesNotAccessMemory();
- CI->insertBefore(inst);
- inst->replaceAllUsesWith(CI);
- return true;
-}
-
-bool
-AMDGPUPeepholeOpt::expandBFI(CallInst *CI) {
- if (!CI) {
- return false;
- }
- Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
- if (!LHS->getName().startswith("__amdil_bfi")) {
- return false;
- }
- Type* type = CI->getOperand(0)->getType();
- Constant *negOneConst = NULL;
- if (type->isVectorTy()) {
- std::vector<Constant *> negOneVals;
- negOneConst = ConstantInt::get(CI->getContext(),
- APInt(32, StringRef("-1"), 10));
- for (size_t x = 0,
- y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
- negOneVals.push_back(negOneConst);
- }
- negOneConst = ConstantVector::get(negOneVals);
- } else {
- negOneConst = ConstantInt::get(CI->getContext(),
- APInt(32, StringRef("-1"), 10));
- }
- // __amdil_bfi => (A & B) | (~A & C)
- BinaryOperator *lhs =
- BinaryOperator::Create(Instruction::And, CI->getOperand(0),
- CI->getOperand(1), "bfi_and", CI);
- BinaryOperator *rhs =
- BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst,
- "bfi_not", CI);
- rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2),
- "bfi_and", CI);
- lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI);
- CI->replaceAllUsesWith(lhs);
- return true;
-}
-
-bool
-AMDGPUPeepholeOpt::expandBFM(CallInst *CI) {
- if (!CI) {
- return false;
- }
- Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
- if (!LHS->getName().startswith("__amdil_bfm")) {
- return false;
- }
- // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f)
- Constant *newMaskConst = NULL;
- Constant *newShiftConst = NULL;
- Type* type = CI->getOperand(0)->getType();
- if (type->isVectorTy()) {
- std::vector<Constant*> newMaskVals, newShiftVals;
- newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
- newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
- for (size_t x = 0,
- y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) {
- newMaskVals.push_back(newMaskConst);
- newShiftVals.push_back(newShiftConst);
- }
- newMaskConst = ConstantVector::get(newMaskVals);
- newShiftConst = ConstantVector::get(newShiftVals);
- } else {
- newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F);
- newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1);
- }
- BinaryOperator *lhs =
- BinaryOperator::Create(Instruction::And, CI->getOperand(0),
- newMaskConst, "bfm_mask", CI);
- lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst,
- lhs, "bfm_shl", CI);
- lhs = BinaryOperator::Create(Instruction::Sub, lhs,
- newShiftConst, "bfm_sub", CI);
- BinaryOperator *rhs =
- BinaryOperator::Create(Instruction::And, CI->getOperand(1),
- newMaskConst, "bfm_mask", CI);
- lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI);
- CI->replaceAllUsesWith(lhs);
- return true;
-}
-
-bool
-AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) {
- Instruction *inst = (*bbb);
- if (optimizeCallInst(bbb)) {
- return true;
- }
- if (optimizeBitExtract(inst)) {
- return false;
- }
- if (optimizeBitInsert(inst)) {
- return false;
- }
- if (correctMisalignedMemOp(inst)) {
- return false;
- }
- return false;
-}
-bool
-AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) {
- LoadInst *linst = dyn_cast<LoadInst>(inst);
- StoreInst *sinst = dyn_cast<StoreInst>(inst);
- unsigned alignment;
- Type* Ty = inst->getType();
- if (linst) {
- alignment = linst->getAlignment();
- Ty = inst->getType();
- } else if (sinst) {
- alignment = sinst->getAlignment();
- Ty = sinst->getValueOperand()->getType();
- } else {
- return false;
- }
- unsigned size = getTypeSize(Ty);
- if (size == alignment || size < alignment) {
- return false;
- }
- if (!Ty->isStructTy()) {
- return false;
- }
- if (alignment < 4) {
- if (linst) {
- linst->setAlignment(0);
- return true;
- } else if (sinst) {
- sinst->setAlignment(0);
- return true;
- }
- }
- return false;
-}
-bool
-AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI) {
- if (!CI) {
- return false;
- }
- Value *LHS = CI->getOperand(CI->getNumOperands() - 1);
- std::string namePrefix = LHS->getName().substr(0, 14);
- if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24"
- && namePrefix != "__amdil__imul24_high") {
- return false;
- }
- if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) {
- return false;
- }
- return true;
-}
-
-void
-AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI) {
- assert(isSigned24BitOps(CI) && "Must be a "
- "signed 24 bit operation to call this function!");
- Value *LHS = CI->getOperand(CI->getNumOperands()-1);
- // On 7XX and 8XX we do not have signed 24bit, so we need to
- // expand it to the following:
- // imul24 turns into 32bit imul
- // imad24 turns into 32bit imad
- // imul24_high turns into 32bit imulhigh
- if (LHS->getName().substr(0, 14) == "__amdil_imad24") {
- Type *aType = CI->getOperand(0)->getType();
- bool isVector = aType->isVectorTy();
- int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
- std::vector<Type*> callTypes;
- callTypes.push_back(CI->getOperand(0)->getType());
- callTypes.push_back(CI->getOperand(1)->getType());
- callTypes.push_back(CI->getOperand(2)->getType());
- FunctionType *funcType =
- FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
- std::string name = "__amdil_imad";
- if (isVector) {
- name += "_v" + itostr(numEle) + "i32";
- } else {
- name += "_i32";
- }
- Function *Func = dyn_cast<Function>(
- CI->getParent()->getParent()->getParent()->
- getOrInsertFunction(StringRef(name), funcType));
- Value *Operands[3] = {
- CI->getOperand(0),
- CI->getOperand(1),
- CI->getOperand(2)
- };
- CallInst *nCI = CallInst::Create(Func, Operands, "imad24");
- nCI->insertBefore(CI);
- CI->replaceAllUsesWith(nCI);
- } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") {
- BinaryOperator *mulOp =
- BinaryOperator::Create(Instruction::Mul, CI->getOperand(0),
- CI->getOperand(1), "imul24", CI);
- CI->replaceAllUsesWith(mulOp);
- } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") {
- Type *aType = CI->getOperand(0)->getType();
-
- bool isVector = aType->isVectorTy();
- int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1;
- std::vector<Type*> callTypes;
- callTypes.push_back(CI->getOperand(0)->getType());
- callTypes.push_back(CI->getOperand(1)->getType());
- FunctionType *funcType =
- FunctionType::get(CI->getOperand(0)->getType(), callTypes, false);
- std::string name = "__amdil_imul_high";
- if (isVector) {
- name += "_v" + itostr(numEle) + "i32";
- } else {
- name += "_i32";
- }
- Function *Func = dyn_cast<Function>(
- CI->getParent()->getParent()->getParent()->
- getOrInsertFunction(StringRef(name), funcType));
- Value *Operands[2] = {
- CI->getOperand(0),
- CI->getOperand(1)
- };
- CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high");
- nCI->insertBefore(CI);
- CI->replaceAllUsesWith(nCI);
- }
-}
-
-bool
-AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI) {
- return (CI != NULL
- && CI->getOperand(CI->getNumOperands() - 1)->getName()
- == "__amdil_get_local_size_int");
-}
-
-bool
-AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI) {
- if (!CI) {
- return false;
- }
- if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX
- && (mSTM->getDeviceName() == "cayman")) {
- return false;
- }
- return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20)
- == "__amdil_improved_div";
-}
-
-void
-AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI) {
- assert(convertAccurateDivide(CI)
- && "expanding accurate divide can only happen if it is expandable!");
- BinaryOperator *divOp =
- BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0),
- CI->getOperand(1), "fdiv32", CI);
- CI->replaceAllUsesWith(divOp);
-}
-
-bool
-AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI) {
- if (optLevel != CodeGenOpt::None) {
- return false;
- }
-
- if (!CI) {
- return false;
- }
-
- unsigned funcNameIdx = 0;
- funcNameIdx = CI->getNumOperands() - 1;
- StringRef calleeName = CI->getOperand(funcNameIdx)->getName();
- if (calleeName != "__amdil_image2d_read_norm"
- && calleeName != "__amdil_image2d_read_unnorm"
- && calleeName != "__amdil_image3d_read_norm"
- && calleeName != "__amdil_image3d_read_unnorm") {
- return false;
- }
-
- unsigned samplerIdx = 2;
- samplerIdx = 1;
- Value *sampler = CI->getOperand(samplerIdx);
- LoadInst *lInst = dyn_cast<LoadInst>(sampler);
- if (!lInst) {
- return false;
- }
-
- if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
- return false;
- }
-
- GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand());
- // If we are loading from what is not a global value, then we
- // fail and return.
- if (!gv) {
- return false;
- }
-
- // If we don't have an initializer or we have an initializer and
- // the initializer is not a 32bit integer, we fail.
- if (!gv->hasInitializer()
- || !gv->getInitializer()->getType()->isIntegerTy(32)) {
- return false;
- }
-
- // Now that we have the global variable initializer, lets replace
- // all uses of the load instruction with the samplerVal and
- // reparse the __amdil_is_constant() function.
- Constant *samplerVal = gv->getInitializer();
- lInst->replaceAllUsesWith(samplerVal);
- return true;
-}
-
-bool
-AMDGPUPeepholeOpt::doInitialization(Module &M) {
- return false;
-}
-
-bool
-AMDGPUPeepholeOpt::doFinalization(Module &M) {
- return false;
-}
-
-void
-AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<MachineFunctionAnalysis>();
- FunctionPass::getAnalysisUsage(AU);
- AU.setPreservesAll();
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
- size_t size = 0;
- if (!T) {
- return size;
- }
- switch (T->getTypeID()) {
- case Type::X86_FP80TyID:
- case Type::FP128TyID:
- case Type::PPC_FP128TyID:
- case Type::LabelTyID:
- assert(0 && "These types are not supported by this backend");
- default:
- case Type::FloatTyID:
- case Type::DoubleTyID:
- size = T->getPrimitiveSizeInBits() >> 3;
- break;
- case Type::PointerTyID:
- size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
- break;
- case Type::IntegerTyID:
- size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
- break;
- case Type::StructTyID:
- size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
- break;
- case Type::ArrayTyID:
- size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
- break;
- case Type::FunctionTyID:
- size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
- break;
- case Type::VectorTyID:
- size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
- break;
- };
- return size;
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST,
- bool dereferencePtr) {
- size_t size = 0;
- if (!ST) {
- return size;
- }
- Type *curType;
- StructType::element_iterator eib;
- StructType::element_iterator eie;
- for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
- curType = *eib;
- size += getTypeSize(curType, dereferencePtr);
- }
- return size;
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT,
- bool dereferencePtr) {
- return IT ? (IT->getBitWidth() >> 3) : 0;
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
- bool dereferencePtr) {
- assert(0 && "Should not be able to calculate the size of an function type");
- return 0;
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
- bool dereferencePtr) {
- return (size_t)(AT ? (getTypeSize(AT->getElementType(),
- dereferencePtr) * AT->getNumElements())
- : 0);
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
- bool dereferencePtr) {
- return VT ? (VT->getBitWidth() >> 3) : 0;
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
- bool dereferencePtr) {
- if (!PT) {
- return 0;
- }
- Type *CT = PT->getElementType();
- if (CT->getTypeID() == Type::StructTyID &&
- PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
- return getTypeSize(dyn_cast<StructType>(CT));
- } else if (dereferencePtr) {
- size_t size = 0;
- for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
- size += getTypeSize(PT->getContainedType(x), dereferencePtr);
- }
- return size;
- } else {
- return 4;
- }
-}
-
-size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
- bool dereferencePtr) {
- //assert(0 && "Should not be able to calculate the size of an opaque type");
- return 4;
-}
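For reference, the two bit-field helpers expanded by the deleted pass have simple scalar semantics; a minimal C++ sketch of the equivalent computations (the helper names below are illustrative, not identifiers from the source):

#include <cstdint>

// __amdil_bfi => (A & B) | (~A & C): take bits of B where A is 1,
// and bits of C where A is 0.
static uint32_t amdil_bfi(uint32_t A, uint32_t B, uint32_t C) {
  return (A & B) | (~A & C);
}

// __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1F):
// a run of (src0 & 31) one bits, shifted left by (src1 & 31).
static uint32_t amdil_bfm(uint32_t Src0, uint32_t Src1) {
  return ((1u << (Src0 & 0x1F)) - 1u) << (Src1 & 0x1F);
}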
diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
index 2ad2047278..97f0a40c29 100644
--- a/lib/Target/R600/CMakeLists.txt
+++ b/lib/Target/R600/CMakeLists.txt
@@ -21,7 +21,6 @@ add_llvm_target(R600CodeGen
AMDILISelDAGToDAG.cpp
AMDILISelLowering.cpp
AMDILNIDevice.cpp
- AMDILPeepholeOptimizer.cpp
AMDILSIDevice.cpp
AMDGPUAsmPrinter.cpp
AMDGPUFrameLowering.cpp
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 45d009c2a0..61d70bb342 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -78,7 +78,7 @@ static MCCodeEmitter *createAMDGPUMCCodeEmitter(const MCInstrInfo &MCII,
if (STI.getFeatureBits() & AMDGPU::Feature64BitPtr) {
return createSIMCCodeEmitter(MCII, MRI, STI, Ctx);
} else {
- return createR600MCCodeEmitter(MCII, MRI, STI, Ctx);
+ return createR600MCCodeEmitter(MCII, MRI, STI);
}
}
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
index 09d0d5b61c..abb032045b 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -33,8 +33,7 @@ extern Target TheAMDGPUTarget;
MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
- const MCSubtargetInfo &STI,
- MCContext &Ctx);
+ const MCSubtargetInfo &STI);
MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index 271a974734..cb4cf0ce38 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -36,13 +36,12 @@ class R600MCCodeEmitter : public AMDGPUMCCodeEmitter {
const MCInstrInfo &MCII;
const MCRegisterInfo &MRI;
const MCSubtargetInfo &STI;
- MCContext &Ctx;
public:
R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
- const MCSubtargetInfo &sti, MCContext &ctx)
- : MCII(mcii), MRI(mri), STI(sti), Ctx(ctx) { }
+ const MCSubtargetInfo &sti)
+ : MCII(mcii), MRI(mri), STI(sti) { }
/// \brief Encode the instruction and write it to the OS.
virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
@@ -99,9 +98,8 @@ enum TextureTypes {
MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
- const MCSubtargetInfo &STI,
- MCContext &Ctx) {
- return new R600MCCodeEmitter(MCII, MRI, STI, Ctx);
+ const MCSubtargetInfo &STI) {
+ return new R600MCCodeEmitter(MCII, MRI, STI);
}
void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
@@ -181,6 +179,13 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
Emit((u_int32_t) 0, OS);
} else {
uint64_t Inst = getBinaryCodeForInstr(MI, Fixups);
+ if ((STI.getFeatureBits() & AMDGPU::FeatureR600ALUInst) &&
+ ((Desc.TSFlags & R600_InstFlag::OP1) ||
+ Desc.TSFlags & R600_InstFlag::OP2)) {
+ uint64_t ISAOpCode = Inst & (0x3FFULL << 39);
+ Inst &= ~(0x3FFULL << 39);
+ Inst |= ISAOpCode << 1;
+ }
Emit(Inst, OS);
}
}
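The new fixup applied in EncodeInstruction moves the 10-bit ALU opcode field (bits 39-48) up by one position on FeatureR600ALUInst targets; the same bit manipulation as a standalone C++ sketch (hypothetical helper name):

#include <cstdint>

static uint64_t fixupR600ALUOpcode(uint64_t Inst) {
  uint64_t ISAOpCode = Inst & (0x3FFULL << 39);  // extract the 10-bit opcode field
  Inst &= ~(0x3FFULL << 39);                     // clear its old position
  Inst |= ISAOpCode << 1;                        // reinsert it one bit higher
  return Inst;
}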
diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td
index 5ee1c0d8ae..0cbe919d81 100644
--- a/lib/Target/R600/Processors.td
+++ b/lib/Target/R600/Processors.td
@@ -45,3 +45,4 @@ def : Proc<"tahiti", SI_Itin, [Feature64BitPtr, FeatureFP64]>;
def : Proc<"pitcairn", SI_Itin, [Feature64BitPtr, FeatureFP64]>;
def : Proc<"verde", SI_Itin, [Feature64BitPtr, FeatureFP64]>;
def : Proc<"oland", SI_Itin, [Feature64BitPtr, FeatureFP64]>;
+def : Proc<"hainan", SI_Itin, [Feature64BitPtr, FeatureFP64]>;
diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index cdda3dab8d..ffe3414413 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -325,7 +325,7 @@ public:
virtual bool runOnMachineFunction(MachineFunction &MF) {
unsigned MaxStack = 0;
unsigned CurrentStack = 0;
- bool hasPush;
+ bool HasPush = false;
for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
++MB) {
MachineBasicBlock &MBB = *MB;
@@ -337,6 +337,7 @@ public:
BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
getHWInstrDesc(CF_CALL_FS));
CfCount++;
+ MaxStack = 1;
}
std::vector<ClauseFile> FetchClauses, AluClauses;
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
@@ -354,7 +355,7 @@ public:
case AMDGPU::CF_ALU_PUSH_BEFORE:
CurrentStack++;
MaxStack = std::max(MaxStack, CurrentStack);
- hasPush = true;
+ HasPush = true;
case AMDGPU::CF_ALU:
I = MI;
AluClauses.push_back(MakeALUClause(MBB, I));
@@ -475,7 +476,7 @@ public:
break;
}
}
- MFI->StackSize = getHWStackSize(MaxStack, hasPush);
+ MFI->StackSize = getHWStackSize(MaxStack, HasPush);
}
return false;
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index c6e2136ff4..7252235d5b 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -43,6 +43,8 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setOperationAction(ISD::AND, MVT::v4i32, Expand);
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand);
setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand);
+ setOperationAction(ISD::MUL, MVT::v2i32, Expand);
+ setOperationAction(ISD::MUL, MVT::v4i32, Expand);
setOperationAction(ISD::OR, MVT::v4i32, Expand);
setOperationAction(ISD::OR, MVT::v2i32, Expand);
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand);
@@ -50,6 +52,10 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setOperationAction(ISD::SHL, MVT::v2i32, Expand);
setOperationAction(ISD::SRL, MVT::v4i32, Expand);
setOperationAction(ISD::SRL, MVT::v2i32, Expand);
+ setOperationAction(ISD::SRA, MVT::v4i32, Expand);
+ setOperationAction(ISD::SRA, MVT::v2i32, Expand);
+ setOperationAction(ISD::SUB, MVT::v4i32, Expand);
+ setOperationAction(ISD::SUB, MVT::v2i32, Expand);
setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand);
setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
setOperationAction(ISD::UREM, MVT::v4i32, Expand);
@@ -78,6 +84,9 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
setOperationAction(ISD::SELECT, MVT::i32, Custom);
setOperationAction(ISD::SELECT, MVT::f32, Custom);
+ setOperationAction(ISD::VSELECT, MVT::v4i32, Expand);
+ setOperationAction(ISD::VSELECT, MVT::v2i32, Expand);
+
// Legalize loads and stores to the private address space.
setOperationAction(ISD::LOAD, MVT::i32, Custom);
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 8a60add450..8f47523524 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1615,6 +1615,7 @@ let Predicates = [isEGorCayman] in {
i32:$src2))],
VecALU
>;
+ def : BFEPattern <BFE_UINT_eg>;
def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", [], VecALU>;
defm : BFIPatterns <BFI_INT_eg>;
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 16e1e42c50..42b4e73509 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -369,7 +369,14 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF,
/// getCompactUnwindRegNum - Get the compact unwind number for a given
/// register. The number corresponds to the enum lists in
/// compact_unwind_encoding.h.
-static int getCompactUnwindRegNum(const uint16_t *CURegs, unsigned Reg) {
+static int getCompactUnwindRegNum(unsigned Reg, bool is64Bit) {
+ static const uint16_t CU32BitRegs[] = {
+ X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
+ };
+ static const uint16_t CU64BitRegs[] = {
+ X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
+ };
+ const uint16_t *CURegs = is64Bit ? CU64BitRegs : CU32BitRegs;
for (int Idx = 1; *CURegs; ++CURegs, ++Idx)
if (*CURegs == Reg)
return Idx;
@@ -398,16 +405,8 @@ encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
// 4 3
// 5 3
//
- static const uint16_t CU32BitRegs[] = {
- X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
- };
- static const uint16_t CU64BitRegs[] = {
- X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
- };
- const uint16_t *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs);
-
for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) {
- int CUReg = getCompactUnwindRegNum(CURegs, SavedRegs[i]);
+ int CUReg = getCompactUnwindRegNum(SavedRegs[i], Is64Bit);
if (CUReg == -1) return ~0U;
SavedRegs[i] = CUReg;
}
@@ -466,14 +465,6 @@ encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
static uint32_t
encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
bool Is64Bit) {
- static const uint16_t CU32BitRegs[] = {
- X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
- };
- static const uint16_t CU64BitRegs[] = {
- X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
- };
- const uint16_t *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs);
-
// Encode the registers in the order they were saved, 3-bits per register. The
// registers are numbered from 1 to CU_NUM_SAVED_REGS.
uint32_t RegEnc = 0;
@@ -481,7 +472,7 @@ encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS],
unsigned Reg = SavedRegs[I];
if (Reg == 0) continue;
- int CURegNum = getCompactUnwindRegNum(CURegs, Reg);
+ int CURegNum = getCompactUnwindRegNum(Reg, Is64Bit);
if (CURegNum == -1) return ~0U;
// Encode the 3-bit register number in order, skipping over 3-bits for each
@@ -534,6 +525,12 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const {
// If there are too many saved registers, we cannot use compact encoding.
if (SavedRegIdx >= CU_NUM_SAVED_REGS) return CU::UNWIND_MODE_DWARF;
+ unsigned Reg = MI.getOperand(0).getReg();
+ if (Reg == (Is64Bit ? X86::RAX : X86::EAX)) {
+ ExpectEnd = true;
+ continue;
+ }
+
SavedRegs[SavedRegIdx++] = MI.getOperand(0).getReg();
StackAdjust += OffsetSize;
InstrOffset += PushInstrSize;
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 45fe69aa29..f69f5d85f7 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -9336,29 +9336,31 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
if (Swap)
std::swap(Op0, Op1);
- // Since SSE has no unsigned integer comparisons, we need to flip the sign
- // bits of the inputs before performing those operations.
- if (FlipSigns) {
- EVT EltVT = VT.getVectorElementType();
- SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()),
- EltVT);
- std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit);
- SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0],
- SignBits.size());
- Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec);
- Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec);
- }
-
// Check that the operation in question is available (most are plain SSE2,
// but PCMPGTQ and PCMPEQQ have different requirements).
if (VT == MVT::v2i64) {
if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) {
assert(Subtarget->hasSSE2() && "Don't know how to lower!");
- // First cast everything to the right type,
+ // First cast everything to the right type.
Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
+ // Since SSE has no unsigned integer comparisons, we need to flip the sign
+ // bits of the inputs before performing those operations. The lower
+ // compare is always unsigned.
+ SDValue SB;
+ if (FlipSigns) {
+ SB = DAG.getConstant(0x80000000U, MVT::v4i32);
+ } else {
+ SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32);
+ SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32);
+ SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32,
+ Sign, Zero, Sign, Zero);
+ }
+ Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
+ Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
+
// Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
@@ -9384,7 +9386,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
// pcmpeqd + pshufd + pand.
assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
- // First cast everything to the right type,
+ // First cast everything to the right type.
Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
@@ -9403,6 +9405,15 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
}
}
+ // Since SSE has no unsigned integer comparisons, we need to flip the sign
+ // bits of the inputs before performing those operations.
+ if (FlipSigns) {
+ EVT EltVT = VT.getVectorElementType();
+ SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT);
+ Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
+ Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
+ }
+
SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
// If the logical-not of the result is required, perform that now.
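The reordered lowering relies on the identity that an unsigned integer comparison equals a signed comparison once the sign bits of both operands are flipped; a scalar C++ illustration of that identity (explanatory only, not code from the patch):

#include <cstdint>

// Flipping the sign bit maps [0, 2^32) onto [-2^31, 2^31) monotonically,
// so a signed compare of the xor'ed values gives the unsigned result.
static bool ugt_via_signed(uint32_t A, uint32_t B) {
  int32_t SA = (int32_t)(A ^ 0x80000000u);
  int32_t SB = (int32_t)(B ^ 0x80000000u);
  return SA > SB;  // same result as (A > B) on the unsigned values
}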
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index ad4a6c7c41..08d372512d 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -80,6 +80,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/PatternMatch.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/ValueHandle.h"
#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -382,7 +383,7 @@ public:
// The starting value of the reduction.
// It does not have to be zero!
- Value *StartValue;
+ TrackingVH<Value> StartValue;
// The instruction who's value is used outside the loop.
Instruction *LoopExitInstr;
// The kind of the reduction.
@@ -427,7 +428,7 @@ public:
/// This flag indicates if we need to add the runtime check.
bool Need;
/// Holds the pointers that we need to check.
- SmallVector<Value*, 2> Pointers;
+ SmallVector<TrackingVH<Value>, 2> Pointers;
/// Holds the pointer value at the beginning of the loop.
SmallVector<const SCEV*, 2> Starts;
/// Holds the pointer value at the end of the loop.
@@ -441,7 +442,7 @@ public:
InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
InductionInfo() : StartValue(0), IK(IK_NoInduction) {}
/// Start value.
- Value *StartValue;
+ TrackingVH<Value> StartValue;
/// Induction kind.
InductionKind IK;
};
@@ -2307,7 +2308,10 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
}
bool LoopVectorizationLegality::canVectorize() {
- assert(TheLoop->getLoopPreheader() && "No preheader!!");
+ // We must have a loop in canonical form. Loops with indirectbr in them cannot
+ // be canonicalized.
+ if (!TheLoop->getLoopPreheader())
+ return false;
// We can only vectorize innermost loops.
if (TheLoop->getSubLoopsVector().size())
@@ -2374,6 +2378,26 @@ bool LoopVectorizationLegality::canVectorize() {
return true;
}
+/// \brief Check that the instruction has outside loop users and is not an
+/// identified reduction variable.
+static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
+ SmallPtrSet<Value *, 4> &Reductions) {
+ // Reduction instructions are allowed to have exit users. All other
+ // instructions must not have external users.
+ if (!Reductions.count(Inst))
+ //Check that all of the users of the loop are inside the BB.
+ for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end();
+ I != E; ++I) {
+ Instruction *U = cast<Instruction>(*I);
+ // This user may be a reduction exit value.
+ if (!TheLoop->contains(U)) {
+ DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
+ return true;
+ }
+ }
+ return false;
+}
+
bool LoopVectorizationLegality::canVectorizeInstrs() {
BasicBlock *PreHeader = TheLoop->getLoopPreheader();
BasicBlock *Header = TheLoop->getHeader();
@@ -2412,8 +2436,13 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// If this PHINode is not in the header block, then we know that we
// can convert it to select during if-conversion. No need to check if
// the PHIs in this block are induction or reduction variables.
- if (*bb != Header)
- continue;
+ if (*bb != Header) {
+ // Check that this instruction has no outside users or is an
+ // identified reduction value with an outside user.
+ if(!hasOutsideLoopUser(TheLoop, it, AllowedExit))
+ continue;
+ return false;
+ }
// We only allow if-converted PHIs with more than two incoming values.
if (Phi->getNumIncomingValues() != 2) {
@@ -2506,17 +2535,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// Reduction instructions are allowed to have exit users.
// All other instructions must not have external users.
- if (!AllowedExit.count(it))
- //Check that all of the users of the loop are inside the BB.
- for (Value::use_iterator I = it->use_begin(), E = it->use_end();
- I != E; ++I) {
- Instruction *U = cast<Instruction>(*I);
- // This user may be a reduction exit value.
- if (!TheLoop->contains(U)) {
- DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
- return false;
- }
- }
+ if (hasOutsideLoopUser(TheLoop, it, AllowedExit))
+ return false;
+
} // next instr.
}
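The switch from raw Value* members to TrackingVH<Value> matters because SCEV expansion can RAUW a cached start value; a minimal sketch of the handle's behavior, assuming the ValueHandle.h header included above:

#include "llvm/Support/ValueHandle.h"
using namespace llvm;

void cacheStartValue(Value *Start) {
  // A raw Value* would keep pointing at the old, replaced node after a
  // replaceAllUsesWith; a TrackingVH transparently follows the replacement.
  TrackingVH<Value> Cached(Start);
  // ... transformations that may RAUW Start run here ...
  Value *Live = Cached;  // always yields the current value
  (void)Live;
}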
diff --git a/test/CodeGen/ARM/vmul.ll b/test/CodeGen/ARM/vmul.ll
index 74628f0c5c..eb5ad8f0c3 100644
--- a/test/CodeGen/ARM/vmul.ll
+++ b/test/CodeGen/ARM/vmul.ll
@@ -599,3 +599,27 @@ for.end179: ; preds = %for.cond.loopexit,
declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16>) nounwind readnone
+
+; vmull lowering would create a zext(v4i8 load()) instead of a zextload(v4i8),
+; creating an illegal type during legalization and causing an assert.
+; PR15970
+define void @no_illegal_types_vmull_sext(<4 x i32> %a) {
+entry:
+ %wide.load283.i = load <4 x i8>* undef, align 1
+ %0 = sext <4 x i8> %wide.load283.i to <4 x i32>
+ %1 = sub nsw <4 x i32> %0, %a
+ %2 = mul nsw <4 x i32> %1, %1
+ %predphi290.v.i = select <4 x i1> undef, <4 x i32> undef, <4 x i32> %2
+ store <4 x i32> %predphi290.v.i, <4 x i32>* undef, align 4
+ ret void
+}
+define void @no_illegal_types_vmull_zext(<4 x i32> %a) {
+entry:
+ %wide.load283.i = load <4 x i8>* undef, align 1
+ %0 = zext <4 x i8> %wide.load283.i to <4 x i32>
+ %1 = sub nsw <4 x i32> %0, %a
+ %2 = mul nsw <4 x i32> %1, %1
+ %predphi290.v.i = select <4 x i1> undef, <4 x i32> undef, <4 x i32> %2
+ store <4 x i32> %predphi290.v.i, <4 x i32>* undef, align 4
+ ret void
+}
diff --git a/test/CodeGen/Generic/annotate.ll b/test/CodeGen/Generic/annotate.ll
new file mode 100644
index 0000000000..c617eb0925
--- /dev/null
+++ b/test/CodeGen/Generic/annotate.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s
+
+; PR15253
+
+@.str = private unnamed_addr constant [4 x i8] c"sth\00", section "llvm.metadata"
+@.str1 = private unnamed_addr constant [4 x i8] c"t.c\00", section "llvm.metadata"
+
+
+define i32 @foo(i32 %a) {
+entry:
+ %0 = call i32 @llvm.annotation.i32(i32 %a, i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([4 x i8]* @.str1, i32 0, i32 0), i32 2)
+ ret i32 %0
+}
+
+declare i32 @llvm.annotation.i32(i32, i8*, i8*, i32) #1
diff --git a/test/CodeGen/Generic/ptr-annotate.ll b/test/CodeGen/Generic/ptr-annotate.ll
new file mode 100644
index 0000000000..ac5bd5533e
--- /dev/null
+++ b/test/CodeGen/Generic/ptr-annotate.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s
+
+; PR15253
+
+%struct.mystruct = type { i32 }
+
+@.str = private unnamed_addr constant [4 x i8] c"sth\00", section "llvm.metadata"
+@.str1 = private unnamed_addr constant [4 x i8] c"t.c\00", section "llvm.metadata"
+
+define void @foo() {
+entry:
+ %m = alloca i8, align 4
+ %0 = call i8* @llvm.ptr.annotation.p0i8(i8* %m, i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([4 x i8]* @.str1, i32 0, i32 0), i32 2)
+ store i8 1, i8* %0, align 4
+ ret void
+}
+
+declare i8* @llvm.ptr.annotation.p0i8(i8*, i8*, i8*, i32) #1
diff --git a/test/CodeGen/NVPTX/generic-to-nvvm.ll b/test/CodeGen/NVPTX/generic-to-nvvm.ll
new file mode 100644
index 0000000000..c9cb2f71f4
--- /dev/null
+++ b/test/CodeGen/NVPTX/generic-to-nvvm.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -drvcuda | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+; Ensure global variables in address space 0 are promoted to address space 1
+
+; CHECK: .global .align 4 .u32 myglobal = 42;
+@myglobal = internal global i32 42, align 4
+; CHECK: .global .align 4 .u32 myconst = 42;
+@myconst = internal constant i32 42, align 4
+
+
+define void @foo(i32* %a, i32* %b) {
+; CHECK: cvta.global.u32
+ %ld1 = load i32* @myglobal
+; CHECK: cvta.global.u32
+ %ld2 = load i32* @myconst
+ store i32 %ld1, i32* %a
+ store i32 %ld2, i32* %b
+ ret void
+}
+
+
+!nvvm.annotations = !{!0}
+!0 = metadata !{void (i32*, i32*)* @foo, metadata !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/i1-global.ll b/test/CodeGen/NVPTX/i1-global.ll
new file mode 100644
index 0000000000..0595325977
--- /dev/null
+++ b/test/CodeGen/NVPTX/i1-global.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -drvcuda | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+
+; CHECK: .visible .global .align 1 .u8 mypred
+@mypred = addrspace(1) global i1 true, align 1
+
+
+define void @foo(i1 %p, i32* %out) {
+ %ld = load i1 addrspace(1)* @mypred
+ %val = zext i1 %ld to i32
+ store i32 %val, i32* %out
+ ret void
+}
+
+
+!nvvm.annotations = !{!0}
+!0 = metadata !{void (i1, i32*)* @foo, metadata !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/i1-param.ll b/test/CodeGen/NVPTX/i1-param.ll
new file mode 100644
index 0000000000..fabd61a25d
--- /dev/null
+++ b/test/CodeGen/NVPTX/i1-param.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -drvcuda | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+; Make sure predicate (i1) operands to kernels get expanded out to .u8
+
+; CHECK: .entry foo
+; CHECK: .param .u8 foo_param_0
+; CHECK: .param .u32 foo_param_1
+define void @foo(i1 %p, i32* %out) {
+ %val = zext i1 %p to i32
+ store i32 %val, i32* %out
+ ret void
+}
+
+
+!nvvm.annotations = !{!0}
+!0 = metadata !{void (i1, i32*)* @foo, metadata !"kernel", i32 1}
diff --git a/test/CodeGen/NVPTX/intrinsics.ll b/test/CodeGen/NVPTX/intrinsics.ll
index 8b0357be87..1676f20643 100644
--- a/test/CodeGen/NVPTX/intrinsics.ll
+++ b/test/CodeGen/NVPTX/intrinsics.ll
@@ -15,5 +15,12 @@ define ptx_device double @test_fabs(double %d) {
ret double %x
}
+define float @test_nvvm_sqrt(float %a) {
+ %val = call float @llvm.nvvm.sqrt.f(float %a)
+ ret float %val
+}
+
+
declare float @llvm.fabs.f32(float)
declare double @llvm.fabs.f64(double)
+declare float @llvm.nvvm.sqrt.f(float)
diff --git a/test/CodeGen/NVPTX/refl1.ll b/test/CodeGen/NVPTX/refl1.ll
new file mode 100644
index 0000000000..5a9dac152e
--- /dev/null
+++ b/test/CodeGen/NVPTX/refl1.ll
@@ -0,0 +1,37 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -drvcuda | FileCheck %s
+
+; Function Attrs: nounwind
+; CHECK: .entry foo
+define void @foo(float* nocapture %a) #0 {
+ %val = load float* %a
+ %tan = tail call fastcc float @__nv_fast_tanf(float %val)
+ store float %tan, float* %a
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare float @llvm.nvvm.sin.approx.ftz.f(float) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.nvvm.cos.approx.ftz.f(float) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.nvvm.div.approx.ftz.f(float, float) #1
+
+; Function Attrs: alwaysinline inlinehint nounwind readnone
+; CHECK: .func (.param .b32 func_retval0) __nv_fast_tanf
+define internal fastcc float @__nv_fast_tanf(float %a) #2 {
+entry:
+ %0 = tail call float @llvm.nvvm.sin.approx.ftz.f(float %a)
+ %1 = tail call float @llvm.nvvm.cos.approx.ftz.f(float %a)
+ %2 = tail call float @llvm.nvvm.div.approx.ftz.f(float %0, float %1)
+ ret float %2
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { alwaysinline inlinehint nounwind readnone }
+
+!nvvm.annotations = !{!0}
+
+!0 = metadata !{void (float*)* @foo, metadata !"kernel", i32 1}
diff --git a/test/CodeGen/R600/bfe_uint.ll b/test/CodeGen/R600/bfe_uint.ll
new file mode 100644
index 0000000000..92570c3152
--- /dev/null
+++ b/test/CodeGen/R600/bfe_uint.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: @bfe_def
+; CHECK: BFE_UINT
+define void @bfe_def(i32 addrspace(1)* %out, i32 %x) {
+entry:
+ %0 = lshr i32 %x, 5
+ %1 = and i32 %0, 15 ; 0xf
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
+
+; This program could be implemented using a BFE_UINT instruction; however,
+; since the lshr constant + number of bits in the mask is >= 32, it can also be
+; implemented with an LSHR instruction, which is better, because LSHR has fewer
+; operands and requires fewer constants.
+
+; CHECK: @bfe_shift
+; CHECK-NOT: BFE_UINT
+define void @bfe_shift(i32 addrspace(1)* %out, i32 %x) {
+entry:
+ %0 = lshr i32 %x, 16
+ %1 = and i32 %0, 65535 ; 0xffff
+ store i32 %1, i32 addrspace(1)* %out
+ ret void
+}
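The second function above exercises the case where the shift amount plus the mask width covers all 32 bits, making the AND redundant; a standalone C++ check of that arithmetic (editorial example, not part of the test):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t X = 0xDEADBEEFu;
  // offset 16, width 16: 16 + 16 == 32, so masking after the shift is a no-op.
  assert(((X >> 16) & 0xFFFFu) == (X >> 16));
  // offset 5, width 4: 5 + 4 < 32, so the mask still matters and BFE_UINT applies.
  assert(((X >> 5) & 0xFu) != (X >> 5));
  return 0;
}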
diff --git a/test/CodeGen/R600/mul.ll b/test/CodeGen/R600/mul.ll
new file mode 100644
index 0000000000..7278e90398
--- /dev/null
+++ b/test/CodeGen/R600/mul.ll
@@ -0,0 +1,16 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; mul24 and mad24 are affected
+;CHECK: MULLO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MULLO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MULLO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MULLO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
+ %a = load <4 x i32> addrspace(1) * %in
+ %b = load <4 x i32> addrspace(1) * %b_ptr
+ %result = mul <4 x i32> %a, %b
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/r600-encoding.ll b/test/CodeGen/R600/r600-encoding.ll
new file mode 100644
index 0000000000..c8040a1b4c
--- /dev/null
+++ b/test/CodeGen/R600/r600-encoding.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s
+; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=rs880 | FileCheck --check-prefix=R600-CHECK %s
+
+; The earliest R600 GPUs have a slightly different encoding than the rest of
+; the VLIW4/5 GPUs.
+
+; EG-CHECK: @test
+; EG-CHECK: MUL_IEEE {{[ *TXYZW.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x01,0x[0-9a-f]+,0x[0-9a-f]+}}]
+
+; R600-CHECK: @test
+; R600-CHECK: MUL_IEEE {{[ *TXYZW.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x02,0x[0-9a-f]+,0x[0-9a-f]+}}]
+
+define void @test() {
+entry:
+ %0 = call float @llvm.R600.load.input(i32 0)
+ %1 = call float @llvm.R600.load.input(i32 1)
+ %2 = fmul float %0, %1
+ call void @llvm.AMDGPU.store.output(float %2, i32 0)
+ ret void
+}
+
+declare float @llvm.R600.load.input(i32) readnone
+
+declare void @llvm.AMDGPU.store.output(float, i32)
diff --git a/test/CodeGen/R600/sra.ll b/test/CodeGen/R600/sra.ll
new file mode 100644
index 0000000000..972542d346
--- /dev/null
+++ b/test/CodeGen/R600/sra.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: @ashr_v4i32
+; CHECK: ASHR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: ASHR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: ASHR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: ASHR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) {
+ %result = ashr <4 x i32> %a, %b
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/sub.ll b/test/CodeGen/R600/sub.ll
new file mode 100644
index 0000000000..12bfba3975
--- /dev/null
+++ b/test/CodeGen/R600/sub.ll
@@ -0,0 +1,15 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: SUB_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+ %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1
+ %a = load <4 x i32> addrspace(1) * %in
+ %b = load <4 x i32> addrspace(1) * %b_ptr
+ %result = sub <4 x i32> %a, %b
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/R600/vselect.ll b/test/CodeGen/R600/vselect.ll
new file mode 100644
index 0000000000..6e459df847
--- /dev/null
+++ b/test/CodeGen/R600/vselect.ll
@@ -0,0 +1,17 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; CHECK: @test_select_v4i32
+; CHECK: CNDE_INT T{{[0-9]+\.[XYZW], PV\.[xyzw], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: CNDE_INT * T{{[0-9]+\.[XYZW], PV\.[xyzw], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: CNDE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+; CHECK: CNDE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
+entry:
+ %0 = load <4 x i32> addrspace(1)* %in0
+ %1 = load <4 x i32> addrspace(1)* %in1
+ %cmp = icmp ne <4 x i32> %0, %1
+ %result = select <4 x i1> %cmp, <4 x i32> %0, <4 x i32> %1
+ store <4 x i32> %result, <4 x i32> addrspace(1)* %out
+ ret void
+}
diff --git a/test/CodeGen/X86/compact-unwind.ll b/test/CodeGen/X86/compact-unwind.ll
new file mode 100644
index 0000000000..8c4fa27da5
--- /dev/null
+++ b/test/CodeGen/X86/compact-unwind.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -disable-cfi -disable-fp-elim -mtriple x86_64-apple-darwin11 | FileCheck %s
+
+%ty = type { i8* }
+
+@gv = external global i32
+
+; This is aligning the stack with a push of a random register.
+; CHECK: pushq %rax
+
+; Even though we can't encode %rax into the compact unwind, we still want to be
+; able to generate a compact unwind encoding in this particular case.
+;
+; CHECK: __LD,__compact_unwind
+; CHECK: _foo ## Range Start
+; CHECK: 16842753 ## Compact Unwind Encoding: 0x1010001
+
+define i8* @foo(i64 %size) {
+ %addr = alloca i64, align 8
+ %tmp20 = load i32* @gv, align 4
+ %tmp21 = call i32 @bar()
+ %tmp25 = load i64* %addr, align 8
+ %tmp26 = inttoptr i64 %tmp25 to %ty*
+ %tmp29 = getelementptr inbounds %ty* %tmp26, i64 0, i32 0
+ %tmp34 = load i8** %tmp29, align 8
+ %tmp35 = getelementptr inbounds i8* %tmp34, i64 %size
+ store i8* %tmp35, i8** %tmp29, align 8
+ ret i8* null
+}
+
+declare i32 @bar()
diff --git a/test/CodeGen/X86/vec_compare.ll b/test/CodeGen/X86/vec_compare.ll
index 85d8b2cea3..fd5c234bb1 100644
--- a/test/CodeGen/X86/vec_compare.ll
+++ b/test/CodeGen/X86/vec_compare.ll
@@ -67,7 +67,15 @@ define <2 x i64> @test6(<2 x i64> %A, <2 x i64> %B) nounwind {
}
define <2 x i64> @test7(<2 x i64> %A, <2 x i64> %B) nounwind {
+; CHECK: [[CONSTSEG:[A-Z0-9_]*]]:
+; CHECK: .long 2147483648
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .long 2147483648
+; CHECK-NEXT: .long 0
; CHECK: test7:
+; CHECK: movdqa [[CONSTSEG]], [[CONSTREG:%xmm[0-9]*]]
+; CHECK: pxor [[CONSTREG]]
+; CHECK: pxor [[CONSTREG]]
; CHECK: pcmpgtd %xmm1
; CHECK: pshufd $-96
; CHECK: pcmpeqd
@@ -83,6 +91,8 @@ define <2 x i64> @test7(<2 x i64> %A, <2 x i64> %B) nounwind {
define <2 x i64> @test8(<2 x i64> %A, <2 x i64> %B) nounwind {
; CHECK: test8:
+; CHECK: pxor
+; CHECK: pxor
; CHECK: pcmpgtd %xmm0
; CHECK: pshufd $-96
; CHECK: pcmpeqd
@@ -98,6 +108,8 @@ define <2 x i64> @test8(<2 x i64> %A, <2 x i64> %B) nounwind {
define <2 x i64> @test9(<2 x i64> %A, <2 x i64> %B) nounwind {
; CHECK: test9:
+; CHECK: pxor
+; CHECK: pxor
; CHECK: pcmpgtd %xmm0
; CHECK: pshufd $-96
; CHECK: pcmpeqd
@@ -115,6 +127,8 @@ define <2 x i64> @test9(<2 x i64> %A, <2 x i64> %B) nounwind {
define <2 x i64> @test10(<2 x i64> %A, <2 x i64> %B) nounwind {
; CHECK: test10:
+; CHECK: pxor
+; CHECK: pxor
; CHECK: pcmpgtd %xmm1
; CHECK: pshufd $-96
; CHECK: pcmpeqd
@@ -131,9 +145,15 @@ define <2 x i64> @test10(<2 x i64> %A, <2 x i64> %B) nounwind {
}
define <2 x i64> @test11(<2 x i64> %A, <2 x i64> %B) nounwind {
+; CHECK: [[CONSTSEG:[A-Z0-9_]*]]:
+; CHECK: .long 2147483648
+; CHECK-NEXT: .long 2147483648
+; CHECK-NEXT: .long 2147483648
+; CHECK-NEXT: .long 2147483648
; CHECK: test11:
-; CHECK: pxor
-; CHECK: pxor
+; CHECK: movdqa [[CONSTSEG]], [[CONSTREG:%xmm[0-9]*]]
+; CHECK: pxor [[CONSTREG]]
+; CHECK: pxor [[CONSTREG]]
; CHECK: pcmpgtd %xmm1
; CHECK: pshufd $-96
; CHECK: pcmpeqd
diff --git a/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll b/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll
index e7af892c10..620478a879 100644
--- a/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll
+++ b/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll
@@ -1,5 +1,6 @@
; RUN: llc -O0 %s -mtriple=x86_64-apple-darwin -filetype=obj -o %t
; RUN: llvm-dwarfdump %t | FileCheck %s
+; RUN: llc < %s -O0 -mtriple=x86_64-apple-macosx10.7 | FileCheck %s -check-prefix=ASM
; rdar://13067005
; CHECK: .debug_info contents:
@@ -20,6 +21,11 @@
; CHECK: file_names[ 1] 0 0x00000000 0x00000000 simple2.c
; CHECK-NOT: file_names
+; PR15408
+; ASM: L__DWARF__debug_info_begin0:
+; ASM: .long 0 ## DW_AT_stmt_list
+; ASM: L__DWARF__debug_info_begin1:
+; ASM: .long 0 ## DW_AT_stmt_list
define i32 @test(i32 %a) nounwind uwtable ssp {
entry:
%a.addr = alloca i32, align 4
diff --git a/test/ExecutionEngine/MCJIT/eh.ll b/test/ExecutionEngine/MCJIT/eh.ll
index 0c19b1bf2e..c2135736ad 100644
--- a/test/ExecutionEngine/MCJIT/eh.ll
+++ b/test/ExecutionEngine/MCJIT/eh.ll
@@ -1,5 +1,5 @@
; RUN: %lli_mcjit %s
-; XFAIL: arm, cygwin
+; XFAIL: arm, cygwin, win32, mingw
declare i8* @__cxa_allocate_exception(i64)
declare void @__cxa_throw(i8*, i8*, i8*)
declare i32 @__gxx_personality_v0(...)
diff --git a/test/MC/AsmParser/exprs.s b/test/MC/AsmParser/exprs.s
index df075f85ec..a7e10020b6 100644
--- a/test/MC/AsmParser/exprs.s
+++ b/test/MC/AsmParser/exprs.s
@@ -45,6 +45,7 @@ k:
check_expr 0 || 0, 0
check_expr 1 + 2 < 3 + 4, 1
check_expr 1 << 8 - 1, 128
+ check_expr 3 * 9 - 2 * 9 + 1, 10
.set c, 10
check_expr c + 1, 11
diff --git a/test/Transforms/LoopUnroll/scevunroll.ll b/test/Transforms/LoopUnroll/scevunroll.ll
index 99b3a7d861..308a036316 100644
--- a/test/Transforms/LoopUnroll/scevunroll.ll
+++ b/test/Transforms/LoopUnroll/scevunroll.ll
@@ -66,13 +66,16 @@ exit2:
; SCEV properly unrolls multi-exit loops.
;
+; SCEV cannot currently unroll this loop.
+; It should ideally detect a trip count of 5.
+; rdar:14038809 [SCEV]: Optimize trip count computation for multi-exit loops.
; CHECK: @multiExit
-; CHECK: getelementptr i32* %base, i32 10
-; CHECK-NEXT: load i32*
-; CHECK: br i1 false, label %l2.10, label %exit1
-; CHECK: l2.10:
-; CHECK-NOT: br
-; CHECK: ret i32
+; CHECKFIXME: getelementptr i32* %base, i32 10
+; CHECKFIXME-NEXT: load i32*
+; CHECKFIXME: br i1 false, label %l2.10, label %exit1
+; CHECKFIXME: l2.10:
+; CHECKFIXME-NOT: br
+; CHECKFIXME: ret i32
define i32 @multiExit(i32* %base) nounwind {
entry:
br label %l1
@@ -170,3 +173,38 @@ for.body87:
br label %for.body87
}
+; PR16130: clang produces incorrect code with loop/expression at -O2
+; rdar:14036816 loop-unroll makes assumptions about undefined behavior
+;
+; The loop latch is assumed to exit after the first iteration because
+; of the induction variable's NSW flag. However, the loop latch's
+; equality test is skipped and the loop exits after the second
+; iteration via the early exit. So loop unrolling cannot assume that
+; the loop latch's exit count of zero is an upper bound on the number
+; of iterations.
+;
+; CHECK: @nsw_latch
+; CHECK: for.body:
+; CHECK: %b.03 = phi i32 [ 0, %entry ], [ %add, %for.cond ]
+; CHECK: return:
+; CHECK: %b.03.lcssa = phi i32 [ %b.03, %for.body ], [ %b.03, %for.cond ]
+define void @nsw_latch(i32* %a) nounwind {
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.cond, %entry
+ %b.03 = phi i32 [ 0, %entry ], [ %add, %for.cond ]
+ %tobool = icmp eq i32 %b.03, 0
+ %add = add nsw i32 %b.03, 8
+ br i1 %tobool, label %for.cond, label %return
+
+for.cond: ; preds = %for.body
+ %cmp = icmp eq i32 %add, 13
+ br i1 %cmp, label %return, label %for.body
+
+return: ; preds = %for.body, %for.cond
+ %b.03.lcssa = phi i32 [ %b.03, %for.body ], [ %b.03, %for.cond ]
+ %retval.0 = phi i32 [ 1, %for.body ], [ 0, %for.cond ]
+ store i32 %b.03.lcssa, i32* %a, align 4
+ ret void
+}
diff --git a/test/Transforms/LoopUnroll/unloop.ll b/test/Transforms/LoopUnroll/unloop.ll
index 5a9cacda44..9a938cc287 100644
--- a/test/Transforms/LoopUnroll/unloop.ll
+++ b/test/Transforms/LoopUnroll/unloop.ll
@@ -21,8 +21,8 @@ outer:
inner:
%iv = phi i32 [ 0, %outer ], [ %inc, %tail ]
%inc = add i32 %iv, 1
- %wbucond = call zeroext i1 @check()
- br i1 %wbucond, label %outer.backedge, label %tail
+ call zeroext i1 @check()
+ br i1 true, label %outer.backedge, label %tail
tail:
br i1 false, label %inner, label %exit
@@ -126,25 +126,27 @@ return:
; Ensure that only the middle loop is removed and rely on verify-loopinfo to
; check soundness.
;
-; CHECK: @unloopDeepNested
+; This test must be disabled until trip count computation can be optimized...
+; rdar:14038809 [SCEV]: Optimize trip count computation for multi-exit loops.
+; CHECKFIXME: @unloopDeepNested
; Inner-inner loop control.
-; CHECK: while.cond.us.i:
-; CHECK: br i1 %cmp.us.i, label %next_data.exit, label %while.body.us.i
-; CHECK: if.then.us.i:
-; CHECK: br label %while.cond.us.i
+; CHECKFIXME: while.cond.us.i:
+; CHECKFIXME: br i1 %cmp.us.i, label %next_data.exit, label %while.body.us.i
+; CHECKFIXME: if.then.us.i:
+; CHECKFIXME: br label %while.cond.us.i
; Inner loop tail.
-; CHECK: if.else.i:
-; CHECK: br label %while.cond.outer.i
+; CHECKFIXME: if.else.i:
+; CHECKFIXME: br label %while.cond.outer.i
; Middle loop control (removed).
-; CHECK: valid_data.exit:
-; CHECK-NOT: br
-; CHECK: %cmp = call zeroext i1 @check()
+; CHECKFIXME: valid_data.exit:
+; CHECKFIXME-NOT: br
+; CHECKFIXME: %cmp = call zeroext i1 @check()
; Outer loop control.
-; CHECK: copy_data.exit:
-; CHECK: br i1 %cmp38, label %if.then39, label %while.cond.outer
+; CHECKFIXME: copy_data.exit:
+; CHECKFIXME: br i1 %cmp38, label %if.then39, label %while.cond.outer
; Outer-outer loop tail.
-; CHECK: while.cond.outer.outer.backedge:
-; CHECK: br label %while.cond.outer.outer
+; CHECKFIXME: while.cond.outer.outer.backedge:
+; CHECKFIXME: br label %while.cond.outer.outer
define void @unloopDeepNested() nounwind {
for.cond8.preheader.i:
%cmp113.i = call zeroext i1 @check()
diff --git a/test/Transforms/LoopVectorize/lcssa-crash.ll b/test/Transforms/LoopVectorize/lcssa-crash.ll
index 06b3b08aa0..de6be54849 100644
--- a/test/Transforms/LoopVectorize/lcssa-crash.ll
+++ b/test/Transforms/LoopVectorize/lcssa-crash.ll
@@ -27,3 +27,14 @@ for.end.i.i.i:
unreachable
}
+; PR16139
+define void @test2(i8* %x) {
+entry:
+ indirectbr i8* %x, [ label %L0, label %L1 ]
+
+L0:
+ br label %L0
+
+L1:
+ ret void
+}
diff --git a/test/Transforms/LoopVectorize/no_outside_user.ll b/test/Transforms/LoopVectorize/no_outside_user.ll
new file mode 100644
index 0000000000..6f0357c5e5
--- /dev/null
+++ b/test/Transforms/LoopVectorize/no_outside_user.ll
@@ -0,0 +1,41 @@
+; RUN: opt -S -loop-vectorize -force-vector-unroll=1 -force-vector-width=2 < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+
+@f = common global i32 0, align 4
+@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
+@c = common global i32 0, align 4
+@a = common global i32 0, align 4
+@b = common global i32 0, align 4
+@e = common global i32 0, align 4
+
+; We used to vectorize this loop. But it has a value that is used outside of the
+; loop and is not a recognized reduction variable "tmp17".
+
+; CHECK-NOT: <2 x i32>
+
+define i32 @main() {
+bb:
+ %b.promoted = load i32* @b, align 4
+ br label %.lr.ph.i
+
+.lr.ph.i:
+ %tmp8 = phi i32 [ %tmp18, %bb16 ], [ %b.promoted, %bb ]
+ %tmp2 = icmp sgt i32 %tmp8, 10
+ br i1 %tmp2, label %bb16, label %bb10
+
+bb10:
+ br label %bb16
+
+bb16:
+ %tmp17 = phi i32 [ 0, %bb10 ], [ 1, %.lr.ph.i ]
+ %tmp18 = add nsw i32 %tmp8, 1
+ %tmp19 = icmp slt i32 %tmp18, 4
+ br i1 %tmp19, label %.lr.ph.i, label %f1.exit.loopexit
+
+f1.exit.loopexit:
+ %.lcssa = phi i32 [ %tmp17, %bb16 ]
+ ret i32 %.lcssa
+}
+
+
diff --git a/test/Transforms/LoopVectorize/value-ptr-bug.ll b/test/Transforms/LoopVectorize/value-ptr-bug.ll
new file mode 100644
index 0000000000..f376656f07
--- /dev/null
+++ b/test/Transforms/LoopVectorize/value-ptr-bug.ll
@@ -0,0 +1,50 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-unroll=1 -dce -instcombine < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; PR16073
+
+; Because we were caching value pointers across a function call that could RAUW
+; we would generate an undefined value store below:
+; SCEVExpander::expandCodeFor would change a value (the start value of an
+; induction) that we cached in the induction variable list.
+
+; CHECK: test_vh
+; CHECK-NOT: store <4 x i8> undef
+
+define void @test_vh(i32* %ptr265, i32* %ptr266, i32 %sub267) {
+entry:
+ br label %loop
+
+loop:
+ %inc = phi i32 [ %sub267, %entry ], [ %add, %loop]
+ %ext.inc = sext i32 %inc to i64
+ %add.ptr265 = getelementptr inbounds i32* %ptr265, i64 %ext.inc
+ %add.ptr266 = getelementptr inbounds i32* %ptr266, i64 %ext.inc
+ %add = add i32 %inc, 9
+ %cmp = icmp slt i32 %add, 140
+ br i1 %cmp, label %block1, label %loop
+
+block1:
+ %sub267.lcssa = phi i32 [ %add, %loop ]
+ %add.ptr266.lcssa = phi i32* [ %add.ptr266, %loop ]
+ %add.ptr265.lcssa = phi i32* [ %add.ptr265, %loop ]
+ %tmp29 = bitcast i32* %add.ptr265.lcssa to i8*
+ %tmp30 = bitcast i32* %add.ptr266.lcssa to i8*
+ br label %do.body272
+
+do.body272:
+ %row_width.5 = phi i32 [ %sub267.lcssa, %block1 ], [ %dec, %do.body272 ]
+ %sp.4 = phi i8* [ %tmp30, %block1 ], [ %incdec.ptr273, %do.body272 ]
+ %dp.addr.4 = phi i8* [ %tmp29, %block1 ], [ %incdec.ptr274, %do.body272 ]
+ %incdec.ptr273 = getelementptr inbounds i8* %sp.4, i64 1
+ %tmp31 = load i8* %sp.4, align 1
+ %incdec.ptr274 = getelementptr inbounds i8* %dp.addr.4, i64 1
+ store i8 %tmp31, i8* %dp.addr.4, align 1
+ %dec = add i32 %row_width.5, -1
+ %cmp276 = icmp eq i32 %dec, 0
+ br i1 %cmp276, label %loop.exit, label %do.body272
+
+loop.exit:
+ ret void
+}