author     Abdoulaye Walsimou Gaye <awg@embtoolkit.org>    2013-06-12 21:27:20 +0200
committer  Abdoulaye Walsimou Gaye <awg@embtoolkit.org>    2013-06-12 21:27:20 +0200
commit     46ef71a74329a0777e3464c65927cd3e59c928d6
tree       e8c06aeb70dcbe18acdf6db1f88a1452ac1dca68
parent     32c493313c30abe0830e95dc92c7ad1c2241ba57
parent     ce337502f5a88500df9ab2f59ab48f97be0b4395
Merge branch 'release_33' of git://github.com/llvm-mirror/llvm into embtk-support-release-3.3
66 files changed, 1554 insertions(+), 1540 deletions(-)
diff --git a/autoconf/configure.ac b/autoconf/configure.ac index c1efd31a62..ffd155d2e2 100644 --- a/autoconf/configure.ac +++ b/autoconf/configure.ac @@ -31,7 +31,7 @@ dnl=== dnl===-----------------------------------------------------------------------=== dnl Initialize autoconf and define the package name, version number and dnl address for reporting bugs. -AC_INIT([LLVM],[3.3svn],[http://llvm.org/bugs/]) +AC_INIT([LLVM],[3.3],[http://llvm.org/bugs/]) AC_DEFINE([LLVM_VERSION_MAJOR], [3], [Major version of the LLVM API]) AC_DEFINE([LLVM_VERSION_MINOR], [3], [Minor version of the LLVM API]) @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.60 for LLVM 3.3svn. +# Generated by GNU Autoconf 2.60 for LLVM 3.3. # # Report bugs to <http://llvm.org/bugs/>. # @@ -561,8 +561,8 @@ SHELL=${CONFIG_SHELL-/bin/sh} # Identity of this package. PACKAGE_NAME='LLVM' PACKAGE_TARNAME='llvm' -PACKAGE_VERSION='3.3svn' -PACKAGE_STRING='LLVM 3.3svn' +PACKAGE_VERSION='3.3' +PACKAGE_STRING='LLVM 3.3' PACKAGE_BUGREPORT='http://llvm.org/bugs/' ac_unique_file="lib/IR/Module.cpp" @@ -1331,7 +1331,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures LLVM 3.3svn to adapt to many kinds of systems. +\`configure' configures LLVM 3.3 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1397,7 +1397,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of LLVM 3.3svn:";; + short | recursive ) echo "Configuration of LLVM 3.3:";; esac cat <<\_ACEOF @@ -1569,7 +1569,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -LLVM configure 3.3svn +LLVM configure 3.3 generated by GNU Autoconf 2.60 Copyright (C) 1992, 1993, 1994, 1995, 1996, 1998, 1999, 2000, 2001, @@ -1585,7 +1585,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by LLVM $as_me 3.3svn, which was +It was created by LLVM $as_me 3.3, which was generated by GNU Autoconf 2.60. Invocation command line was $ $0 $@ @@ -23128,7 +23128,7 @@ exec 6>&1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by LLVM $as_me 3.3svn, which was +This file was extended by LLVM $as_me 3.3, which was generated by GNU Autoconf 2.60. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -23181,7 +23181,7 @@ Report bugs to <bug-autoconf@gnu.org>." _ACEOF cat >>$CONFIG_STATUS <<_ACEOF ac_cs_version="\\ -LLVM config.status 3.3svn +LLVM config.status 3.3 configured by $0, generated by GNU Autoconf 2.60, with options \\"`echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`\\" diff --git a/docs/CommandLine.rst b/docs/CommandLine.rst index 263a025f69..9b77a98908 100644 --- a/docs/CommandLine.rst +++ b/docs/CommandLine.rst @@ -618,6 +618,8 @@ would yield the help output: -help - display available options (-help-hidden for more) -o <filename> - Specify output filename +.. 
_grouping options into categories: + Grouping options into categories -------------------------------- diff --git a/docs/LangRef.rst b/docs/LangRef.rst index 410f640776..7743ff06a0 100644 --- a/docs/LangRef.rst +++ b/docs/LangRef.rst @@ -2868,11 +2868,10 @@ All globals of this sort should have a section specified as The '``llvm.used``' Global Variable ----------------------------------- -The ``@llvm.used`` global is an array which has - :ref:`appending linkage <linkage_appending>`. This array contains a list of -pointers to global variables, functions and aliases which may optionally have a -pointer cast formed of bitcast or getelementptr. For example, a legal -use of it is: +The ``@llvm.used`` global is an array which has :ref:`appending linkage +<linkage_appending>`. This array contains a list of pointers to global +variables, functions and aliases which may optionally have a pointer cast formed +of bitcast or getelementptr. For example, a legal use of it is: .. code-block:: llvm diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst index 7952cd5423..73b0abf628 100644 --- a/docs/ReleaseNotes.rst +++ b/docs/ReleaseNotes.rst @@ -5,12 +5,6 @@ LLVM 3.3 Release Notes .. contents:: :local: -.. warning:: - These are in-progress notes for the upcoming LLVM 3.3 release. You may - prefer the `LLVM 3.2 Release Notes <http://llvm.org/releases/3.2/docs - /ReleaseNotes.html>`_. - - Introduction ============ @@ -34,13 +28,6 @@ page <http://llvm.org/releases/>`_. Non-comprehensive list of changes in this release ================================================= -.. NOTE - For small 1-3 sentence descriptions, just add an entry at the end of - this list. If your description won't fit comfortably in one bullet - point (e.g. maybe you would like to give an example of the - functionality, or simply have a lot to talk about), see the `NOTE` below - for adding a new subsection. - * The CellSPU port has been removed. It can still be found in older versions. * The IR-level extended linker APIs (for example, to link bitcode files out of @@ -70,17 +57,15 @@ Non-comprehensive list of changes in this release examples of the new syntax. The old syntax using register classes still works, but it will be removed in a future LLVM release. -* ... next change ... - -.. NOTE - If you would like to document a larger change, then you can add a - subsection about it right here. You can copy the following boilerplate - and un-indent it (the indentation causes it to be inside this comment). +* MCJIT now supports exception handling. Support for it in the old jit will be + removed in the 3.4 release. - Special New Feature - ------------------- +* Command line options can now be grouped into categories which are shown in + the output of ``-help``. See :ref:`grouping options into categories`. - Makes programs 10x faster by doing Special New Thing. +* The appearance of command line options in ``-help`` that are inherited by + linking with libraries that use the LLVM Command line support library can now + be modified at runtime. See :ref:`cl::getRegisteredOptions`. AArch64 target -------------- @@ -99,9 +84,59 @@ GNU-style thread local storage and inline assembly. Hexagon Target -------------- -- Removed support for legacy hexagonv2 and hexagonv3 processor - architectures which are no longer in use. Currently supported - architectures are hexagonv4 and hexagonv5. +Removed support for legacy hexagonv2 and hexagonv3 processor architectures which +are no longer in use. Currently supported architectures are hexagonv4 and +hexagonv5. 
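As a brief illustration of the option-category feature announced in the release notes above: a minimal sketch against the 3.3-era ``cl::`` API. The tool and option names here are invented for the example and are not taken from the patch; whether a particular inherited option name is registered depends on which LLVM libraries are linked in.

```cpp
#include "llvm/ADT/StringMap.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;

// Options registered with cl::cat() are grouped under their category's
// heading in the -help listing.
static cl::OptionCategory StageCat("Stage Selection Options");
static cl::opt<bool> Stage1("stage1", cl::desc("Run the first stage"),
                            cl::cat(StageCat));
static cl::opt<bool> Stage2("stage2", cl::desc("Run the second stage"),
                            cl::cat(StageCat));

int main(int argc, char **argv) {
  // Options inherited from linked-in LLVM libraries can be looked up and
  // adjusted before parsing, e.g. hidden from the -help output. The option
  // name queried here is only an example; it may or may not be present.
  StringMap<cl::Option *> Opts;
  cl::getRegisteredOptions(Opts); // 3.3-era signature fills a caller's map
  if (cl::Option *O = Opts.lookup("print-all-options"))
    O->setHiddenFlag(cl::ReallyHidden);
  cl::ParseCommandLineOptions(argc, argv, "option category demo\n");
  return 0;
}
```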
+ +Mips target +-------------- + +New features and improvements: + +- Clang driver + - Support for Sourcery CodeBench Mips toolchain directories tree. + - Support for new command line options including: + - -mxgot/-mno-xgot + - -EL / -EB + - -mmicromips / -mno-micromips + - -msingle-float / -mdouble-float + - -mabi=32 (o32 abi) and -mabi=64 (n64 abi) + - Previously, options such as -mips16, -mmicromips, -mdsp and -mdspr2 were + not passed to the assembler. This issue has been fixed. + +- A number of changes have been made to improve the quality of DSP-ASE code + generation. + - Multiply and multiply-accumulate instructions can now use all four + accumulators. + - Instruction selection patterns have been added so that DSP instructions + are emitted without having to use builtins. + +- Delay slot filler pass can now search successor blocks for instructions to + fill delay slots (use option -disable-mips-df-succbb-search=false). + +PowerPC Target +-------------- + +New features and improvements: + +- PowerPC now supports an assembly parser. +- Support added for thread-local storage. 64-bit ELF subtarget only. +- Support added for medium and large code model (-mcmodel=medium,large). + Medium code model is now the default. 64-bit ELF subtarget only. +- Improved register allocation (fewer reserved registers). +- 64-bit atomic load and store are now supported. +- Improved code generation for unaligned memory accesses of scalar types. +- Improved performance of floating-point divide and square root + with -ffast-math. +- Support for predicated returns. +- Improved code generation for comparisons. +- Support added for inline setjmp and longjmp. +- Support added for many instructions introduced in PowerISA 2.04, 2.05, + and 2.06. +- Improved spill code for vector registers. +- Support added for -mno-altivec. +- ABI compatibility fixes for complex parameters, 128-bit integer parameters, + and varargs functions. 64-bit ELF subtarget only. Loop Vectorizer --------------- @@ -126,16 +161,16 @@ SLP Vectorizer -------------- LLVM now has a new SLP vectorizer. The new SLP vectorizer is not enabled by -default but can be enabled using the clang flag -fslp-vectorize. The BB-vectorizer -can also be enabled using the command line flag -fslp-vectorize-aggressive. +default but can be enabled using the clang flag ``-fslp-vectorize``. The +BB-vectorizer can also be enabled using the command line flag +``-fslp-vectorize-aggressive``. R600 Backend ------------ -The R600 backend was added in this release, it supports AMD GPUs -(HD2XXX - HD7XXX). This backend is used in AMD's Open Source -graphics / compute drivers which are developed as part of the `Mesa3D -<http://www.mesa3d.org>`_ project. +The R600 backend was added in this release, it supports AMD GPUs (HD2XXX - +HD7XXX). This backend is used in AMD's Open Source graphics / compute drivers +which are developed as part of the `Mesa3D <http://www.mesa3d.org>`_ project. SystemZ/s390x Backend --------------------- @@ -145,41 +180,130 @@ is restricted to GNU/Linux (GNU triplet s390x-linux-gnu) and requires z10 or greater. +Sub-project Status Update +========================= + +In addition to the core LLVM 3.3 distribution of production-quality compiler +infrastructure, the LLVM project includes sub-projects that use the LLVM core +and share the same distribution license. This section provides updates on these +sub-projects. 
+ + +DragonEgg: GCC front-ends, LLVM back-end +---------------------------------------- + +`DragonEgg <http://dragonegg.llvm.org/>`_ is a +`GCC plugin <http://gcc.gnu.org/wiki/plugins>`_ that replaces GCC's optimizers +and code generators with LLVM's. It works with gcc-4.5, 4.6, 4.7 and 4.8, can +target the x86-32/x86-64 and ARM processor families, and has been successfully +used on the Darwin, FreeBSD, KFreeBSD, Linux and OpenBSD platforms. It fully +supports Ada, C, C++ and Fortran. It has partial support for Go, Java, Obj-C +and Obj-C++. Note that gcc-4.6 is the best supported version, and that Ada in +particular doesn't work well with gcc-4.7 and newer. + +The `3.3 release <http://llvm.org/apt/>`_ has the following notable changes. + +- supports gcc-4.8 (requires gcc-4.8.1 or newer) +- object files can be written directly using LLVM's integrated assembler +- produces saner debug info +- bitfields can now contain arbitrary scalar types (useful for Ada) + + +LLDB: Low Level Debugger +------------------------ + +`LLDB <http://lldb.llvm.org/>`_ is a ground-up implementation of a command-line +debugger, as well as a debugger API that can be used from scripts and other +applications. LLDB uses the following components of the LLVM core distribution +to support the latest language features and target support: + +- the Clang parser for high-quality parsing of C, C++ and Objective C +- the LLVM disassembler +- the LLVM JIT compiler (MCJIT) for expression evaluation + +The `3.3 release <http://lldb.llvm.org/download.html>`_ has the following notable changes. + +Features now supported on Linux: + +- Debugging multi-threaded programs +- Support for watchpoints +- Process list, attach and fork +- `vim integration <http://llvm.org/svn/llvm-project/lldb/branches/release_33/utils/vim-lldb/README>`_ for LLDB + +Portability: + +- Builds with cmake, ninja, auto-tools, clang 3.3 and gcc 4.6 + +Linux Improvements: + +- Improved register support including vector registers +- Basic debugging of i386 programs +- Bug fixes for expression evaluation + + External Open Source Projects Using LLVM 3.3 ============================================ -An exciting aspect of LLVM is that it is used as an enabling technology for -a lot of other language and tools projects. This section lists some of the +An exciting aspect of LLVM is that it is used as an enabling technology for a +lot of other language and tools projects. This section lists some of the projects that have already been updated to work with LLVM 3.3. Portable Computing Language (pocl) ---------------------------------- -In addition to producing an easily portable open source OpenCL -implementation, another major goal of `pocl <http://pocl.sourceforge.net/>`_ -is improving performance portability of OpenCL programs with -compiler optimizations, reducing the need for target-dependent manual -optimizations. An important part of pocl is a set of LLVM passes used to -statically parallelize multiple work-items with the kernel compiler, even in -the presence of work-group barriers. This enables static parallelization of -the fine-grained static concurrency in the work groups in multiple ways. +In addition to producing an easily portable open source OpenCL implementation, +another major goal of `pocl <http://pocl.sourceforge.net/>`_ is improving +performance portability of OpenCL programs with compiler optimizations, reducing +the need for target-dependent manual optimizations. 
An important part of pocl is +a set of LLVM passes used to statically parallelize multiple work-items with the +kernel compiler, even in the presence of work-group barriers. This enables +static parallelization of the fine-grained static concurrency in the work groups +in multiple ways. TTA-based Co-design Environment (TCE) ------------------------------------- -`TCE <http://tce.cs.tut.fi/>`_ is a toolset for designing new -processors based on the Transport triggered architecture (TTA). -The toolset provides a complete co-design flow from C/C++ -programs down to synthesizable VHDL/Verilog and parallel program binaries. -Processor customization points include the register files, function units, -supported operations, and the interconnection network. +`TCE <http://tce.cs.tut.fi/>`_ is a toolset for designing new processors based +on the Transport triggered architecture (TTA). The toolset provides a complete +co-design flow from C/C++ programs down to synthesizable VHDL/Verilog and +parallel program binaries. Processor customization points include the register +files, function units, supported operations, and the interconnection network. TCE uses Clang and LLVM for C/C++/OpenCL C language support, target independent -optimizations and also for parts of code generation. It generates new -LLVM-based code generators "on the fly" for the designed TTA processors and -loads them in to the compiler backend as runtime libraries to avoid -per-target recompilation of larger parts of the compiler chain. +optimizations and also for parts of code generation. It generates new LLVM-based +code generators "on the fly" for the designed TTA processors and loads them in +to the compiler backend as runtime libraries to avoid per-target recompilation +of larger parts of the compiler chain. + +Just-in-time Adaptive Decoder Engine (Jade) +------------------------------------------- + +`Jade <https://github.com/orcc/jade>`_ (Just-in-time Adaptive Decoder Engine) is +a generic video decoder engine using LLVM for just-in-time compilation of video +decoder configurations. Those configurations are designed by MPEG Reconfigurable +Video Coding (RVC) committee. MPEG RVC standard is built on a stream-based +dataflow representation of decoders. It is composed of a standard library of +coding tools written in RVC-CAL language and a dataflow configuration --- block +diagram --- of a decoder. + +Jade project is hosted as part of the Open RVC-CAL Compiler (`Orcc +<http://orcc.sf.net>`_) and requires it to translate the RVC-CAL standard +library of video coding tools into an LLVM assembly code. + +LDC - the LLVM-based D compiler +------------------------------- + +`D <http://dlang.org>`_ is a language with C-like syntax and static typing. It +pragmatically combines efficiency, control, and modeling power, with safety and +programmer productivity. D supports powerful concepts like Compile-Time Function +Execution (CTFE) and Template Meta-Programming, provides an innovative approach +to concurrency and offers many classical paradigms. + +`LDC <http://wiki.dlang.org/LDC>`_ uses the frontend from the reference compiler +combined with LLVM as backend to produce efficient native code. LDC targets +x86/x86_64 systems like Linux, OS X and Windows and also Linux/PPC64. Ports to +other architectures like ARM are underway. 
Additional Information diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h index 306549fba4..349447fbbb 100644 --- a/include/llvm/Analysis/ScalarEvolution.h +++ b/include/llvm/Analysis/ScalarEvolution.h @@ -453,7 +453,8 @@ namespace llvm { ExitLimit ComputeExitLimitFromCond(const Loop *L, Value *ExitCond, BasicBlock *TBB, - BasicBlock *FBB); + BasicBlock *FBB, + bool IsSubExpr); /// ComputeExitLimitFromICmp - Compute the number of times the backedge of /// the specified loop will execute if its exit condition were a conditional @@ -461,7 +462,8 @@ namespace llvm { ExitLimit ComputeExitLimitFromICmp(const Loop *L, ICmpInst *ExitCond, BasicBlock *TBB, - BasicBlock *FBB); + BasicBlock *FBB, + bool IsSubExpr); /// ComputeLoadConstantCompareExitLimit - Given an exit condition /// of 'icmp op load X, cst', try to see if we can compute the @@ -483,7 +485,7 @@ namespace llvm { /// HowFarToZero - Return the number of times an exit condition comparing /// the specified value to zero will execute. If not computable, return /// CouldNotCompute. - ExitLimit HowFarToZero(const SCEV *V, const Loop *L); + ExitLimit HowFarToZero(const SCEV *V, const Loop *L, bool IsSubExpr); /// HowFarToNonZero - Return the number of times an exit condition checking /// the specified value for nonzero will execute. If not computable, return @@ -495,7 +497,7 @@ namespace llvm { /// computable, return CouldNotCompute. isSigned specifies whether the /// less-than is signed. ExitLimit HowManyLessThans(const SCEV *LHS, const SCEV *RHS, - const Loop *L, bool isSigned); + const Loop *L, bool isSigned, bool IsSubExpr); /// getPredecessorWithUniqueSuccessorForBB - Return a predecessor of BB /// (which may not be an immediate predecessor) which has exactly one diff --git a/include/llvm/IR/IntrinsicsNVVM.td b/include/llvm/IR/IntrinsicsNVVM.td index ebfd03e484..c248517def 100644 --- a/include/llvm/IR/IntrinsicsNVVM.td +++ b/include/llvm/IR/IntrinsicsNVVM.td @@ -405,6 +405,8 @@ def llvm_anyi64ptr_ty : LLVMAnyPointerType<llvm_i64_ty>; // (space)i64* // Sqrt // + def int_nvvm_sqrt_f : GCCBuiltin<"__nvvm_sqrt_f">, + Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; def int_nvvm_sqrt_rn_ftz_f : GCCBuiltin<"__nvvm_sqrt_rn_ftz_f">, Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; def int_nvvm_sqrt_rn_f : GCCBuiltin<"__nvvm_sqrt_rn_f">, diff --git a/include/llvm/MC/MCELFObjectWriter.h b/include/llvm/MC/MCELFObjectWriter.h index a59776d5cd..65dd1e8998 100644 --- a/include/llvm/MC/MCELFObjectWriter.h +++ b/include/llvm/MC/MCELFObjectWriter.h @@ -45,7 +45,14 @@ struct ELFRelocationEntry { // Support lexicographic sorting. bool operator<(const ELFRelocationEntry &RE) const { - return RE.r_offset < r_offset; + if (RE.r_offset != r_offset) + return RE.r_offset < r_offset; + if (Type != RE.Type) + return Type < RE.Type; + if (Index != RE.Index) + return Index < RE.Index; + llvm_unreachable("ELFRelocs might be unstable!"); + return 0; } }; diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 6ea915fdb0..f876748af3 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -3937,10 +3937,19 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { /// before taking the branch. For loops with multiple exits, it may not be the /// number times that the loop header executes because the loop may exit /// prematurely via another branch. 
+/// +/// FIXME: We conservatively call getBackedgeTakenCount(L) instead of +/// getExitCount(L, ExitingBlock) to compute a safe trip count considering all +/// loop exits. getExitCount() may return an exact count for this branch +/// assuming no-signed-wrap. The number of well-defined iterations may actually +/// be higher than this trip count if this exit test is skipped and the loop +/// exits via a different branch. Ideally, getExitCount() would know whether it +/// depends on a NSW assumption, and we would only fall back to a conservative +/// trip count in that case. unsigned ScalarEvolution:: -getSmallConstantTripCount(Loop *L, BasicBlock *ExitingBlock) { +getSmallConstantTripCount(Loop *L, BasicBlock */*ExitingBlock*/) { const SCEVConstant *ExitCount = - dyn_cast<SCEVConstant>(getExitCount(L, ExitingBlock)); + dyn_cast<SCEVConstant>(getBackedgeTakenCount(L)); if (!ExitCount) return 0; @@ -3967,8 +3976,8 @@ getSmallConstantTripCount(Loop *L, BasicBlock *ExitingBlock) { /// As explained in the comments for getSmallConstantTripCount, this assumes /// that control exits the loop via ExitingBlock. unsigned ScalarEvolution:: -getSmallConstantTripMultiple(Loop *L, BasicBlock *ExitingBlock) { - const SCEV *ExitCount = getExitCount(L, ExitingBlock); +getSmallConstantTripMultiple(Loop *L, BasicBlock */*ExitingBlock*/) { + const SCEV *ExitCount = getBackedgeTakenCount(L); if (ExitCount == getCouldNotCompute()) return 1; @@ -3997,7 +4006,7 @@ getSmallConstantTripMultiple(Loop *L, BasicBlock *ExitingBlock) { } // getExitCount - Get the expression for the number of loop iterations for which -// this loop is guaranteed not to exit via ExitintBlock. Otherwise return +// this loop is guaranteed not to exit via ExitingBlock. Otherwise return // SCEVCouldNotCompute. const SCEV *ScalarEvolution::getExitCount(Loop *L, BasicBlock *ExitingBlock) { return getBackedgeTakenInfo(L).getExact(ExitingBlock, this); @@ -4382,26 +4391,36 @@ ScalarEvolution::ComputeExitLimit(const Loop *L, BasicBlock *ExitingBlock) { // Proceed to the next level to examine the exit condition expression. return ComputeExitLimitFromCond(L, ExitBr->getCondition(), ExitBr->getSuccessor(0), - ExitBr->getSuccessor(1)); + ExitBr->getSuccessor(1), + /*IsSubExpr=*/false); } /// ComputeExitLimitFromCond - Compute the number of times the /// backedge of the specified loop will execute if its exit condition /// were a conditional branch of ExitCond, TBB, and FBB. +/// +/// @param IsSubExpr is true if ExitCond does not directly control the exit +/// branch. In this case, we cannot assume that the loop only exits when the +/// condition is true and cannot infer that failing to meet the condition prior +/// to integer wraparound results in undefined behavior. ScalarEvolution::ExitLimit ScalarEvolution::ComputeExitLimitFromCond(const Loop *L, Value *ExitCond, BasicBlock *TBB, - BasicBlock *FBB) { + BasicBlock *FBB, + bool IsSubExpr) { // Check if the controlling expression for this loop is an And or Or. if (BinaryOperator *BO = dyn_cast<BinaryOperator>(ExitCond)) { if (BO->getOpcode() == Instruction::And) { // Recurse on the operands of the and. 
- ExitLimit EL0 = ComputeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB); - ExitLimit EL1 = ComputeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB); + bool EitherMayExit = L->contains(TBB); + ExitLimit EL0 = ComputeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB, + IsSubExpr || EitherMayExit); + ExitLimit EL1 = ComputeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB, + IsSubExpr || EitherMayExit); const SCEV *BECount = getCouldNotCompute(); const SCEV *MaxBECount = getCouldNotCompute(); - if (L->contains(TBB)) { + if (EitherMayExit) { // Both conditions must be true for the loop to continue executing. // Choose the less conservative count. if (EL0.Exact == getCouldNotCompute() || @@ -4429,11 +4448,14 @@ ScalarEvolution::ComputeExitLimitFromCond(const Loop *L, } if (BO->getOpcode() == Instruction::Or) { // Recurse on the operands of the or. - ExitLimit EL0 = ComputeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB); - ExitLimit EL1 = ComputeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB); + bool EitherMayExit = L->contains(FBB); + ExitLimit EL0 = ComputeExitLimitFromCond(L, BO->getOperand(0), TBB, FBB, + IsSubExpr || EitherMayExit); + ExitLimit EL1 = ComputeExitLimitFromCond(L, BO->getOperand(1), TBB, FBB, + IsSubExpr || EitherMayExit); const SCEV *BECount = getCouldNotCompute(); const SCEV *MaxBECount = getCouldNotCompute(); - if (L->contains(FBB)) { + if (EitherMayExit) { // Both conditions must be false for the loop to continue executing. // Choose the less conservative count. if (EL0.Exact == getCouldNotCompute() || @@ -4464,7 +4486,7 @@ ScalarEvolution::ComputeExitLimitFromCond(const Loop *L, // With an icmp, it may be feasible to compute an exact backedge-taken count. // Proceed to the next level to examine the icmp. if (ICmpInst *ExitCondICmp = dyn_cast<ICmpInst>(ExitCond)) - return ComputeExitLimitFromICmp(L, ExitCondICmp, TBB, FBB); + return ComputeExitLimitFromICmp(L, ExitCondICmp, TBB, FBB, IsSubExpr); // Check for a constant condition. 
These are normally stripped out by // SimplifyCFG, but ScalarEvolution may be used by a pass which wishes to @@ -4490,7 +4512,8 @@ ScalarEvolution::ExitLimit ScalarEvolution::ComputeExitLimitFromICmp(const Loop *L, ICmpInst *ExitCond, BasicBlock *TBB, - BasicBlock *FBB) { + BasicBlock *FBB, + bool IsSubExpr) { // If the condition was exit on true, convert the condition to exit on false ICmpInst::Predicate Cond; @@ -4542,7 +4565,7 @@ ScalarEvolution::ComputeExitLimitFromICmp(const Loop *L, switch (Cond) { case ICmpInst::ICMP_NE: { // while (X != Y) // Convert to: while (X-Y != 0) - ExitLimit EL = HowFarToZero(getMinusSCEV(LHS, RHS), L); + ExitLimit EL = HowFarToZero(getMinusSCEV(LHS, RHS), L, IsSubExpr); if (EL.hasAnyInfo()) return EL; break; } @@ -4553,24 +4576,24 @@ ScalarEvolution::ComputeExitLimitFromICmp(const Loop *L, break; } case ICmpInst::ICMP_SLT: { - ExitLimit EL = HowManyLessThans(LHS, RHS, L, true); + ExitLimit EL = HowManyLessThans(LHS, RHS, L, true, IsSubExpr); if (EL.hasAnyInfo()) return EL; break; } case ICmpInst::ICMP_SGT: { ExitLimit EL = HowManyLessThans(getNotSCEV(LHS), - getNotSCEV(RHS), L, true); + getNotSCEV(RHS), L, true, IsSubExpr); if (EL.hasAnyInfo()) return EL; break; } case ICmpInst::ICMP_ULT: { - ExitLimit EL = HowManyLessThans(LHS, RHS, L, false); + ExitLimit EL = HowManyLessThans(LHS, RHS, L, false, IsSubExpr); if (EL.hasAnyInfo()) return EL; break; } case ICmpInst::ICMP_UGT: { ExitLimit EL = HowManyLessThans(getNotSCEV(LHS), - getNotSCEV(RHS), L, false); + getNotSCEV(RHS), L, false, IsSubExpr); if (EL.hasAnyInfo()) return EL; break; } @@ -5439,7 +5462,7 @@ SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) { /// effectively V != 0. We know and take advantage of the fact that this /// expression only being used in a comparison by zero context. ScalarEvolution::ExitLimit -ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L) { +ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L, bool IsSubExpr) { // If the value is a constant if (const SCEVConstant *C = dyn_cast<SCEVConstant>(V)) { // If the value is already zero, the branch will execute zero times. @@ -5537,19 +5560,20 @@ ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L) { } // If the recurrence is known not to wraparound, unsigned divide computes the - // back edge count. We know that the value will either become zero (and thus - // the loop terminates), that the loop will terminate through some other exit - // condition first, or that the loop has undefined behavior. This means - // we can't "miss" the exit value, even with nonunit stride. + // back edge count. (Ideally we would have an "isexact" bit for udiv). We know + // that the value will either become zero (and thus the loop terminates), that + // the loop will terminate through some other exit condition first, or that + // the loop has undefined behavior. This means we can't "miss" the exit + // value, even with nonunit stride. // - // FIXME: Prove that loops always exhibits *acceptable* undefined - // behavior. Loops must exhibit defined behavior until a wrapped value is - // actually used. So the trip count computed by udiv could be smaller than the - // number of well-defined iterations. - if (AddRec->getNoWrapFlags(SCEV::FlagNW)) { - // FIXME: We really want an "isexact" bit for udiv. + // This is only valid for expressions that directly compute the loop exit. It + // is invalid for subexpressions in which the loop may exit through this + // branch even if this subexpression is false. 
In that case, the trip count + // computed by this udiv could be smaller than the number of well-defined + // iterations. + if (!IsSubExpr && AddRec->getNoWrapFlags(SCEV::FlagNW)) return getUDivExpr(Distance, CountDown ? getNegativeSCEV(Step) : Step); - } + // Then, try to solve the above equation provided that Start is constant. if (const SCEVConstant *StartC = dyn_cast<SCEVConstant>(Start)) return SolveLinEquationWithOverflow(StepC->getValue()->getValue(), @@ -6315,9 +6339,14 @@ const SCEV *ScalarEvolution::getBECount(const SCEV *Start, /// HowManyLessThans - Return the number of times a backedge containing the /// specified less-than comparison will execute. If not computable, return /// CouldNotCompute. +/// +/// @param IsSubExpr is true when the LHS < RHS condition does not directly +/// control the branch. In this case, we can only compute an iteration count for +/// a subexpression that cannot overflow before evaluating true. ScalarEvolution::ExitLimit ScalarEvolution::HowManyLessThans(const SCEV *LHS, const SCEV *RHS, - const Loop *L, bool isSigned) { + const Loop *L, bool isSigned, + bool IsSubExpr) { // Only handle: "ADDREC < LoopInvariant". if (!isLoopInvariant(RHS, L)) return getCouldNotCompute(); @@ -6326,10 +6355,12 @@ ScalarEvolution::HowManyLessThans(const SCEV *LHS, const SCEV *RHS, return getCouldNotCompute(); // Check to see if we have a flag which makes analysis easy. - bool NoWrap = isSigned ? - AddRec->getNoWrapFlags((SCEV::NoWrapFlags)(SCEV::FlagNSW | SCEV::FlagNW)) : - AddRec->getNoWrapFlags((SCEV::NoWrapFlags)(SCEV::FlagNUW | SCEV::FlagNW)); - + bool NoWrap = false; + if (!IsSubExpr) { + NoWrap = AddRec->getNoWrapFlags( + (SCEV::NoWrapFlags)(((isSigned ? SCEV::FlagNSW : SCEV::FlagNUW)) + | SCEV::FlagNW)); + } if (AddRec->isAffine()) { unsigned BitWidth = getTypeSizeInBits(AddRec->getType()); const SCEV *Step = AddRec->getStepRecurrence(*this); diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index aeaa63f2af..73bba6989f 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -708,6 +708,12 @@ CompileUnit *DwarfDebug::constructCompileUnit(const MDNode *N) { Asm->OutStreamer.getContext().setMCLineTableSymbol(LineTableStartSym, NewCU->getUniqueID()); + // Use a single line table if we are using .loc and generating assembly. + bool UseTheFirstCU = + (Asm->TM.hasMCUseLoc() && + Asm->OutStreamer.getKind() == MCStreamer::SK_AsmStreamer) || + (NewCU->getUniqueID() == 0); + // DW_AT_stmt_list is a offset of line number information for this // compile unit in debug_line section. For split dwarf this is // left in the skeleton CU and so not included. @@ -716,9 +722,9 @@ CompileUnit *DwarfDebug::constructCompileUnit(const MDNode *N) { if (!useSplitDwarf()) { if (Asm->MAI->doesDwarfUseRelocationsAcrossSections()) NewCU->addLabel(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, - NewCU->getUniqueID() == 0 ? + UseTheFirstCU ? 
Asm->GetTempSymbol("section_line") : LineTableStartSym); - else if (NewCU->getUniqueID() == 0) + else if (UseTheFirstCU) NewCU->addUInt(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, 0); else NewCU->addDelta(Die, dwarf::DW_AT_stmt_list, dwarf::DW_FORM_data4, @@ -1441,7 +1447,12 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { LexicalScope *FnScope = LScopes.getCurrentFunctionScope(); CompileUnit *TheCU = SPMap.lookup(FnScope->getScopeNode()); assert(TheCU && "Unable to find compile unit!"); - Asm->OutStreamer.getContext().setDwarfCompileUnitID(TheCU->getUniqueID()); + if (Asm->TM.hasMCUseLoc() && + Asm->OutStreamer.getKind() == MCStreamer::SK_AsmStreamer) + // Use a single line table if we are using .loc and generating assembly. + Asm->OutStreamer.getContext().setDwarfCompileUnitID(0); + else + Asm->OutStreamer.getContext().setDwarfCompileUnitID(TheCU->getUniqueID()); FunctionBeginSym = Asm->GetTempSymbol("func_begin", Asm->getFunctionNumber()); diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp index 07f0ccf52f..d894f664dc 100644 --- a/lib/CodeGen/IntrinsicLowering.cpp +++ b/lib/CodeGen/IntrinsicLowering.cpp @@ -453,6 +453,12 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) { CI->replaceAllUsesWith(ConstantInt::get(CI->getType(), 1)); break; + case Intrinsic::annotation: + case Intrinsic::ptr_annotation: + // Just drop the annotation, but forward the value + CI->replaceAllUsesWith(CI->getOperand(0)); + break; + case Intrinsic::var_annotation: break; // Strip out annotate intrinsic diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 2ded723ca0..67db211ec4 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -5034,6 +5034,11 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { setValue(&I, Res); return 0; } + case Intrinsic::annotation: + case Intrinsic::ptr_annotation: + // Drop the intrinsic, but forward the value + setValue(&I, getValue(I.getOperand(0))); + return 0; case Intrinsic::var_annotation: // Discard annotate attributes return 0; diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index 2368e9e86a..edefdb4c36 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -1092,7 +1092,7 @@ bool AsmParser::ParseBinOpRHS(unsigned Precedence, const MCExpr *&Res, MCBinaryExpr::Opcode Dummy; unsigned NextTokPrec = getBinOpPrecedence(Lexer.getKind(), Dummy); if (TokPrec < NextTokPrec) { - if (ParseBinOpRHS(Precedence+1, RHS, EndLoc)) return true; + if (ParseBinOpRHS(TokPrec+1, RHS, EndLoc)) return true; } // Merge LHS and RHS according to operator. 
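The one-character AsmParser change above (recursing with ``TokPrec+1`` rather than ``Precedence+1``) matters for operator associativity in precedence climbing. The following is a self-contained simplified model of the algorithm, not the AsmParser code itself, showing why the recursive lower bound must come from the current operator's precedence:

```cpp
#include <cassert>
#include <cctype>
#include <string>

// Tiny precedence-climbing evaluator over '+', '-', '*', '/' and integers.
struct Parser {
  std::string S;
  size_t I;
  explicit Parser(const std::string &Src) : S(Src), I(0) {}

  int prec() const {
    if (I >= S.size()) return -1;
    char C = S[I];
    if (C == '+' || C == '-') return 1;
    if (C == '*' || C == '/') return 2;
    return -1;
  }
  long primary() {
    long V = 0;
    while (I < S.size() && isdigit(static_cast<unsigned char>(S[I])))
      V = V * 10 + (S[I++] - '0');
    return V;
  }
  long binOpRHS(int MinPrec, long LHS) {
    for (;;) {
      int TokPrec = prec();
      if (TokPrec < MinPrec) return LHS; // let the caller bind what follows
      char Op = S[I++];
      long RHS = primary();
      // If the next operator binds tighter than this one, it owns RHS first.
      // The recursive bound must be TokPrec + 1: the pre-fix bound of
      // MinPrec + 1 also lets the recursion swallow later operators of the
      // *current* precedence level, breaking left-associativity.
      if (TokPrec < prec())
        RHS = binOpRHS(TokPrec + 1, RHS);
      switch (Op) {
      case '+': LHS += RHS; break;
      case '-': LHS -= RHS; break;
      case '*': LHS *= RHS; break;
      case '/': LHS /= RHS; break;
      }
    }
  }
  long eval() { return binOpRHS(0, primary()); }
};

int main() {
  // Correct left-associative parse: ((1 - 2*3) - 4) == -9. Recursing with
  // MinPrec + 1 instead would group it as 1 - (2*3 - 4) == -1.
  assert(Parser("1-2*3-4").eval() == -9);
  return 0;
}
```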
diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp index 4839c3470c..8f1895e048 100644 --- a/lib/MC/MCStreamer.cpp +++ b/lib/MC/MCStreamer.cpp @@ -35,6 +35,7 @@ MCStreamer::~MCStreamer() { void MCStreamer::reset() { for (unsigned i = 0; i < getNumW64UnwindInfos(); ++i) delete W64UnwindInfos[i]; + W64UnwindInfos.clear(); EmitEHFrame = true; EmitDebugFrame = false; CurrentW64UnwindInfo = 0; diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index 0f7beb1e3b..e49cfc4985 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -5257,6 +5257,23 @@ static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) { return false; } +static EVT getExtensionTo64Bits(const EVT &OrigVT) { + if (OrigVT.getSizeInBits() >= 64) + return OrigVT; + + assert(OrigVT.isSimple() && "Expecting a simple value type"); + + MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy; + switch (OrigSimpleTy) { + default: llvm_unreachable("Unexpected Vector Type"); + case MVT::v2i8: + case MVT::v2i16: + return MVT::v2i32; + case MVT::v4i8: + return MVT::v4i16; + } +} + /// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total /// value size to 64 bits. We need a 64-bit D register as an operand to VMULL. /// We insert the required extension here to get the vector to fill a D register. @@ -5272,18 +5289,8 @@ static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, return N; // Must extend size to at least 64 bits to be used as an operand for VMULL. - MVT::SimpleValueType OrigSimpleTy = OrigTy.getSimpleVT().SimpleTy; - EVT NewVT; - switch (OrigSimpleTy) { - default: llvm_unreachable("Unexpected Orig Vector Type"); - case MVT::v2i8: - case MVT::v2i16: - NewVT = MVT::v2i32; - break; - case MVT::v4i8: - NewVT = MVT::v4i16; - break; - } + EVT NewVT = getExtensionTo64Bits(OrigTy); + return DAG.getNode(ExtOpcode, N->getDebugLoc(), NewVT, N); } @@ -5293,22 +5300,22 @@ static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, /// reach a total size of 64 bits. We have to add the extension separately /// because ARM does not have a sign/zero extending load for vectors. static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG& DAG) { - SDValue NonExtendingLoad = - DAG.getLoad(LD->getMemoryVT(), LD->getDebugLoc(), LD->getChain(), + EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT()); + + // The load already has the right type. + if (ExtendedTy == LD->getMemoryVT()) + return DAG.getLoad(LD->getMemoryVT(), LD->getDebugLoc(), LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), LD->isVolatile(), LD->isNonTemporal(), LD->isInvariant(), LD->getAlignment()); - unsigned ExtOp = 0; - switch (LD->getExtensionType()) { - default: llvm_unreachable("Unexpected LoadExtType"); - case ISD::EXTLOAD: - case ISD::SEXTLOAD: ExtOp = ISD::SIGN_EXTEND; break; - case ISD::ZEXTLOAD: ExtOp = ISD::ZERO_EXTEND; break; - } - MVT::SimpleValueType MemType = LD->getMemoryVT().getSimpleVT().SimpleTy; - MVT::SimpleValueType ExtType = LD->getValueType(0).getSimpleVT().SimpleTy; - return AddRequiredExtensionForVMULL(NonExtendingLoad, DAG, - MemType, ExtType, ExtOp); + + // We need to create a zextload/sextload. We cannot just create a load + // followed by a zext/zext node because LowerMUL is also run during normal + // operation legalization where we can't create illegal types. 
+ return DAG.getExtLoad(LD->getExtensionType(), LD->getDebugLoc(), ExtendedTy, + LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(), + LD->getMemoryVT(), LD->isVolatile(), + LD->isNonTemporal(), LD->getAlignment()); } /// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, diff --git a/lib/Target/NVPTX/CMakeLists.txt b/lib/Target/NVPTX/CMakeLists.txt index 7da2fed4cd..735ca9b6b7 100644 --- a/lib/Target/NVPTX/CMakeLists.txt +++ b/lib/Target/NVPTX/CMakeLists.txt @@ -23,6 +23,7 @@ set(NVPTXCodeGen_sources NVPTXAsmPrinter.cpp NVPTXUtilities.cpp NVVMReflect.cpp + NVPTXGenericToNVVM.cpp ) add_llvm_target(NVPTXCodeGen ${NVPTXCodeGen_sources}) diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h index 6a53a443bf..072c65da35 100644 --- a/lib/Target/NVPTX/NVPTX.h +++ b/lib/Target/NVPTX/NVPTX.h @@ -16,6 +16,7 @@ #define LLVM_TARGET_NVPTX_H #include "MCTargetDesc/NVPTXBaseInfo.h" +#include "llvm/ADT/StringMap.h" #include "llvm/IR/Module.h" #include "llvm/IR/Value.h" #include "llvm/Support/ErrorHandling.h" @@ -62,6 +63,9 @@ createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOpt::Level OptLevel); FunctionPass *createLowerStructArgsPass(NVPTXTargetMachine &); FunctionPass *createNVPTXReMatPass(NVPTXTargetMachine &); FunctionPass *createNVPTXReMatBlockPass(NVPTXTargetMachine &); +ModulePass *createGenericToNVVMPass(); +ModulePass *createNVVMReflectPass(); +ModulePass *createNVVMReflectPass(const StringMap<int>& Mapping); bool isImageOrSamplerVal(const Value *, const Module *); diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index ce5d78afa3..229e4e5980 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -68,11 +68,12 @@ InterleaveSrc("nvptx-emit-src", cl::ZeroOrMore, namespace { /// DiscoverDependentGlobals - Return a set of GlobalVariables on which \p V /// depends. -void DiscoverDependentGlobals(Value *V, DenseSet<GlobalVariable *> &Globals) { - if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) +void DiscoverDependentGlobals(const Value *V, + DenseSet<const GlobalVariable *> &Globals) { + if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) Globals.insert(GV); else { - if (User *U = dyn_cast<User>(V)) { + if (const User *U = dyn_cast<User>(V)) { for (unsigned i = 0, e = U->getNumOperands(); i != e; ++i) { DiscoverDependentGlobals(U->getOperand(i), Globals); } @@ -84,8 +85,9 @@ void DiscoverDependentGlobals(Value *V, DenseSet<GlobalVariable *> &Globals) { /// instances to be emitted, but only after any dependents have been added /// first. void VisitGlobalVariableForEmission( - GlobalVariable *GV, SmallVectorImpl<GlobalVariable *> &Order, - DenseSet<GlobalVariable *> &Visited, DenseSet<GlobalVariable *> &Visiting) { + const GlobalVariable *GV, SmallVectorImpl<const GlobalVariable *> &Order, + DenseSet<const GlobalVariable *> &Visited, + DenseSet<const GlobalVariable *> &Visiting) { // Have we already visited this one? 
if (Visited.count(GV)) return; @@ -98,12 +100,12 @@ void VisitGlobalVariableForEmission( Visiting.insert(GV); // Make sure we visit all dependents first - DenseSet<GlobalVariable *> Others; + DenseSet<const GlobalVariable *> Others; for (unsigned i = 0, e = GV->getNumOperands(); i != e; ++i) DiscoverDependentGlobals(GV->getOperand(i), Others); - for (DenseSet<GlobalVariable *>::iterator I = Others.begin(), - E = Others.end(); + for (DenseSet<const GlobalVariable *>::iterator I = Others.begin(), + E = Others.end(); I != E; ++I) VisitGlobalVariableForEmission(*I, Order, Visited, Visiting); @@ -405,6 +407,11 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() { SmallString<128> Str; raw_svector_ostream O(Str); + if (!GlobalsEmitted) { + emitGlobals(*MF->getFunction()->getParent()); + GlobalsEmitted = true; + } + // Set up MRI = &MF->getRegInfo(); F = MF->getFunction(); @@ -695,7 +702,7 @@ void NVPTXAsmPrinter::emitDeclaration(const Function *F, raw_ostream &O) { else O << ".func "; printReturnValStr(F, O); - O << *CurrentFnSym << "\n"; + O << *Mang->getSymbol(F) << "\n"; emitFunctionParamList(F, O); O << ";\n"; } @@ -795,7 +802,7 @@ static bool useFuncSeen(const Constant *C, return false; } -void NVPTXAsmPrinter::emitDeclarations(Module &M, raw_ostream &O) { +void NVPTXAsmPrinter::emitDeclarations(const Module &M, raw_ostream &O) { llvm::DenseMap<const Function *, bool> seenMap; for (Module::const_iterator FI = M.begin(), FE = M.end(); FI != FE; ++FI) { const Function *F = FI; @@ -805,7 +812,6 @@ void NVPTXAsmPrinter::emitDeclarations(Module &M, raw_ostream &O) { continue; if (F->getIntrinsicID()) continue; - CurrentFnSym = Mang->getSymbol(F); emitDeclaration(F, O); continue; } @@ -817,14 +823,12 @@ void NVPTXAsmPrinter::emitDeclarations(Module &M, raw_ostream &O) { // The use is in the initialization of a global variable // that is a function pointer, so print a declaration // for the original function - CurrentFnSym = Mang->getSymbol(F); emitDeclaration(F, O); break; } // Emit a declaration of this function if the function that // uses this constant expr has already been seen. if (useFuncSeen(C, seenMap)) { - CurrentFnSym = Mang->getSymbol(F); emitDeclaration(F, O); break; } @@ -844,7 +848,6 @@ void NVPTXAsmPrinter::emitDeclarations(Module &M, raw_ostream &O) { // appearing in the module before the callee. so print out // a declaration for the callee. if (seenMap.find(caller) != seenMap.end()) { - CurrentFnSym = Mang->getSymbol(F); emitDeclaration(F, O); break; } @@ -921,6 +924,12 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) { if (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA) recordAndEmitFilenames(M); + GlobalsEmitted = false; + + return false; // success +} + +void NVPTXAsmPrinter::emitGlobals(const Module &M) { SmallString<128> Str2; raw_svector_ostream OS2(Str2); @@ -931,13 +940,13 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) { // global variable in order, and ensure that we emit it *after* its dependent // globals. We use a little extra memory maintaining both a set and a list to // have fast searches while maintaining a strict ordering. 
- SmallVector<GlobalVariable *, 8> Globals; - DenseSet<GlobalVariable *> GVVisited; - DenseSet<GlobalVariable *> GVVisiting; + SmallVector<const GlobalVariable *, 8> Globals; + DenseSet<const GlobalVariable *> GVVisited; + DenseSet<const GlobalVariable *> GVVisiting; // Visit each global variable, in order - for (Module::global_iterator I = M.global_begin(), E = M.global_end(); I != E; - ++I) + for (Module::const_global_iterator I = M.global_begin(), E = M.global_end(); + I != E; ++I) VisitGlobalVariableForEmission(I, Globals, GVVisited, GVVisiting); assert(GVVisited.size() == M.getGlobalList().size() && @@ -951,7 +960,6 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) { OS2 << '\n'; OutStreamer.EmitRawText(OS2.str()); - return false; // success } void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O) { @@ -989,6 +997,14 @@ void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O) { } bool NVPTXAsmPrinter::doFinalization(Module &M) { + + // If we did not emit any functions, then the global declarations have not + // yet been emitted. + if (!GlobalsEmitted) { + emitGlobals(M); + GlobalsEmitted = true; + } + // XXX Temproarily remove global variables so that doFinalization() will not // emit them again (global variables are emitted at beginning). @@ -1063,7 +1079,8 @@ void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V, } } -void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O, +void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar, + raw_ostream &O, bool processDemoted) { // Skip meta data @@ -1107,10 +1124,10 @@ void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O, if (llvm::isSampler(*GVar)) { O << ".global .samplerref " << llvm::getSamplerName(*GVar); - Constant *Initializer = NULL; + const Constant *Initializer = NULL; if (GVar->hasInitializer()) Initializer = GVar->getInitializer(); - ConstantInt *CI = NULL; + const ConstantInt *CI = NULL; if (Initializer) CI = dyn_cast<ConstantInt>(Initializer); if (CI) { @@ -1183,7 +1200,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O, if (localDecls.find(demotedFunc) != localDecls.end()) localDecls[demotedFunc].push_back(GVar); else { - std::vector<GlobalVariable *> temp; + std::vector<const GlobalVariable *> temp; temp.push_back(GVar); localDecls[demotedFunc] = temp; } @@ -1199,7 +1216,11 @@ void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O, if (ETy->isPrimitiveType() || ETy->isIntegerTy() || isa<PointerType>(ETy)) { O << " ."; - O << getPTXFundamentalTypeStr(ETy, false); + // Special case: ABI requires that we use .u8 for predicates + if (ETy->isIntegerTy(1)) + O << "u8"; + else + O << getPTXFundamentalTypeStr(ETy, false); O << " "; O << *Mang->getSymbol(GVar); @@ -1209,7 +1230,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O, (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST_NOT_GEN) || (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) && GVar->hasInitializer()) { - Constant *Initializer = GVar->getInitializer(); + const Constant *Initializer = GVar->getInitializer(); if (!Initializer->isNullValue()) { O << " = "; printScalarConstant(Initializer, O); @@ -1233,7 +1254,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O, (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST_NOT_GEN) || (PTy->getAddressSpace() == llvm::ADDRESS_SPACE_CONST)) && GVar->hasInitializer()) { - Constant *Initializer = GVar->getInitializer(); + const 
Constant *Initializer = GVar->getInitializer(); if (!isa<UndefValue>(Initializer) && !Initializer->isNullValue()) { AggBuffer aggBuffer(ElementSize, O, *this); bufferAggregateConstant(Initializer, &aggBuffer); @@ -1283,7 +1304,7 @@ void NVPTXAsmPrinter::emitDemotedVars(const Function *f, raw_ostream &O) { if (localDecls.find(f) == localDecls.end()) return; - std::vector<GlobalVariable *> &gvars = localDecls[f]; + std::vector<const GlobalVariable *> &gvars = localDecls[f]; for (unsigned i = 0, e = gvars.size(); i != e; ++i) { O << "\t// demoted variable\n\t"; @@ -1448,7 +1469,7 @@ void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I, int paramIndex, raw_ostream &O) { if ((nvptxSubtarget.getDrvInterface() == NVPTX::NVCL) || (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA)) - O << *CurrentFnSym << "_param_" << paramIndex; + O << *Mang->getSymbol(I->getParent()) << "_param_" << paramIndex; else { std::string argName = I->getName(); const char *p = argName.c_str(); @@ -1507,11 +1528,13 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { if (llvm::isImage(*I)) { std::string sname = I->getName(); if (llvm::isImageWriteOnly(*I)) - O << "\t.param .surfref " << *CurrentFnSym << "_param_" << paramIndex; + O << "\t.param .surfref " << *Mang->getSymbol(F) << "_param_" + << paramIndex; else // Default image is read_only - O << "\t.param .texref " << *CurrentFnSym << "_param_" << paramIndex; + O << "\t.param .texref " << *Mang->getSymbol(F) << "_param_" + << paramIndex; } else // Should be llvm::isSampler(*I) - O << "\t.param .samplerref " << *CurrentFnSym << "_param_" + O << "\t.param .samplerref " << *Mang->getSymbol(F) << "_param_" << paramIndex; continue; } @@ -1564,7 +1587,13 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) { } // non-pointer scalar to kernel func - O << "\t.param ." 
<< getPTXFundamentalTypeStr(Ty) << " "; + O << "\t.param ."; + // Special case: predicate operands become .u8 types + if (Ty->isIntegerTy(1)) + O << "u8"; + else + O << getPTXFundamentalTypeStr(Ty); + O << " "; printParamName(I, paramIndex, O); continue; } @@ -1751,12 +1780,12 @@ void NVPTXAsmPrinter::printFPConstant(const ConstantFP *Fp, raw_ostream &O) { O << utohexstr(API.getZExtValue()); } -void NVPTXAsmPrinter::printScalarConstant(Constant *CPV, raw_ostream &O) { - if (ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) { +void NVPTXAsmPrinter::printScalarConstant(const Constant *CPV, raw_ostream &O) { + if (const ConstantInt *CI = dyn_cast<ConstantInt>(CPV)) { O << CI->getValue(); return; } - if (ConstantFP *CFP = dyn_cast<ConstantFP>(CPV)) { + if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CPV)) { printFPConstant(CFP, O); return; } @@ -1764,13 +1793,13 @@ void NVPTXAsmPrinter::printScalarConstant(Constant *CPV, raw_ostream &O) { O << "0"; return; } - if (GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) { + if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) { O << *Mang->getSymbol(GVar); return; } - if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { - Value *v = Cexpr->stripPointerCasts(); - if (GlobalValue *GVar = dyn_cast<GlobalValue>(v)) { + if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { + const Value *v = Cexpr->stripPointerCasts(); + if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) { O << *Mang->getSymbol(GVar); return; } else { @@ -1781,7 +1810,7 @@ void NVPTXAsmPrinter::printScalarConstant(Constant *CPV, raw_ostream &O) { llvm_unreachable("Not scalar type found in printScalarConstant()"); } -void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes, +void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes, AggBuffer *aggBuffer) { const DataLayout *TD = TM.getDataLayout(); @@ -1809,13 +1838,13 @@ void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes, ptr = (unsigned char *)&int16; aggBuffer->addBytes(ptr, 2, Bytes); } else if (ETy == Type::getInt32Ty(CPV->getContext())) { - if (ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) { + if (const ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) { int int32 = (int)(constInt->getZExtValue()); ptr = (unsigned char *)&int32; aggBuffer->addBytes(ptr, 4, Bytes); break; - } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { - if (ConstantInt *constInt = dyn_cast<ConstantInt>( + } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { + if (const ConstantInt *constInt = dyn_cast<ConstantInt>( ConstantFoldConstantExpression(Cexpr, TD))) { int int32 = (int)(constInt->getZExtValue()); ptr = (unsigned char *)&int32; @@ -1831,13 +1860,13 @@ void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes, } llvm_unreachable("unsupported integer const type"); } else if (ETy == Type::getInt64Ty(CPV->getContext())) { - if (ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) { + if (const ConstantInt *constInt = dyn_cast<ConstantInt>(CPV)) { long long int64 = (long long)(constInt->getZExtValue()); ptr = (unsigned char *)&int64; aggBuffer->addBytes(ptr, 8, Bytes); break; - } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { - if (ConstantInt *constInt = dyn_cast<ConstantInt>( + } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { + if (const ConstantInt *constInt = dyn_cast<ConstantInt>( ConstantFoldConstantExpression(Cexpr, TD))) { long long int64 = (long long)(constInt->getZExtValue()); ptr = (unsigned char *)&int64; @@ -1858,7 +1887,7 @@ void 
NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes, } case Type::FloatTyID: case Type::DoubleTyID: { - ConstantFP *CFP = dyn_cast<ConstantFP>(CPV); + const ConstantFP *CFP = dyn_cast<ConstantFP>(CPV); const Type *Ty = CFP->getType(); if (Ty == Type::getFloatTy(CPV->getContext())) { float float32 = (float) CFP->getValueAPF().convertToFloat(); @@ -1874,10 +1903,10 @@ void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes, break; } case Type::PointerTyID: { - if (GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) { + if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) { aggBuffer->addSymbol(GVar); - } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { - Value *v = Cexpr->stripPointerCasts(); + } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) { + const Value *v = Cexpr->stripPointerCasts(); aggBuffer->addSymbol(v); } unsigned int s = TD->getTypeAllocSize(CPV->getType()); @@ -1906,7 +1935,7 @@ void NVPTXAsmPrinter::bufferLEByte(Constant *CPV, int Bytes, } } -void NVPTXAsmPrinter::bufferAggregateConstant(Constant *CPV, +void NVPTXAsmPrinter::bufferAggregateConstant(const Constant *CPV, AggBuffer *aggBuffer) { const DataLayout *TD = TM.getDataLayout(); int Bytes; diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h index 6dc9fc0ffe..7faa6b265b 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.h +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h @@ -91,7 +91,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { unsigned char *buffer; // the buffer unsigned numSymbols; // number of symbol addresses SmallVector<unsigned, 4> symbolPosInBuffer; - SmallVector<Value *, 4> Symbols; + SmallVector<const Value *, 4> Symbols; private: unsigned curpos; @@ -128,7 +128,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { } return curpos; } - void addSymbol(Value *GVar) { + void addSymbol(const Value *GVar) { symbolPosInBuffer.push_back(curpos); Symbols.push_back(GVar); numSymbols++; @@ -153,11 +153,11 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter { if (pos) O << ", "; if (pos == nextSymbolPos) { - Value *v = Symbols[nSym]; - if (GlobalValue *GVar = dyn_cast<GlobalValue>(v)) { + const Value *v = Symbols[nSym]; + if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) { MCSymbol *Name = AP.Mang->getSymbol(GVar); O << *Name; - } else if (ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(v)) { + } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(v)) { O << *nvptx::LowerConstant(Cexpr, AP); } else llvm_unreachable("symbol type unknown"); @@ -205,10 +205,12 @@ private: void printImplicitDef(const MachineInstr *MI, raw_ostream &O) const; // definition autogenerated. void printInstruction(const MachineInstr *MI, raw_ostream &O); - void printModuleLevelGV(GlobalVariable *GVar, raw_ostream &O, bool = false); + void printModuleLevelGV(const GlobalVariable *GVar, raw_ostream &O, + bool = false); void printParamName(int paramIndex, raw_ostream &O); void printParamName(Function::const_arg_iterator I, int paramIndex, raw_ostream &O); + void emitGlobals(const Module &M); void emitHeader(Module &M, raw_ostream &O); void emitKernelFunctionDirectives(const Function &F, raw_ostream &O) const; void emitVirtualRegister(unsigned int vr, bool isVec, raw_ostream &O); @@ -234,6 +236,8 @@ protected: private: std::string CurrentBankselLabelInBasicBlock; + bool GlobalsEmitted; + // This is specific per MachineFunction. 
const MachineRegisterInfo *MRI; // The contents are specific for each @@ -247,7 +251,7 @@ private: std::map<const Type *, std::string> TypeNameMap; // List of variables demoted to a function scope. - std::map<const Function *, std::vector<GlobalVariable *> > localDecls; + std::map<const Function *, std::vector<const GlobalVariable *> > localDecls; // To record filename to ID mapping std::map<std::string, unsigned> filenameMap; @@ -256,15 +260,15 @@ private: void emitPTXGlobalVariable(const GlobalVariable *GVar, raw_ostream &O); void emitPTXAddressSpace(unsigned int AddressSpace, raw_ostream &O) const; std::string getPTXFundamentalTypeStr(const Type *Ty, bool = true) const; - void printScalarConstant(Constant *CPV, raw_ostream &O); + void printScalarConstant(const Constant *CPV, raw_ostream &O); void printFPConstant(const ConstantFP *Fp, raw_ostream &O); - void bufferLEByte(Constant *CPV, int Bytes, AggBuffer *aggBuffer); - void bufferAggregateConstant(Constant *CV, AggBuffer *aggBuffer); + void bufferLEByte(const Constant *CPV, int Bytes, AggBuffer *aggBuffer); + void bufferAggregateConstant(const Constant *CV, AggBuffer *aggBuffer); void printOperandProper(const MachineOperand &MO); void emitLinkageDirective(const GlobalValue *V, raw_ostream &O); - void emitDeclarations(Module &, raw_ostream &O); + void emitDeclarations(const Module &, raw_ostream &O); void emitDeclaration(const Function *, raw_ostream &O); static const char *getRegisterName(unsigned RegNo); diff --git a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp new file mode 100644 index 0000000000..1077c46fb4 --- /dev/null +++ b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp @@ -0,0 +1,436 @@ +//===-- GenericToNVVM.cpp - Convert generic module to NVVM module - C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Convert generic global variables into either .global or .const access based +// on the variable's "constant" qualifier. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXUtilities.h"
+#include "MCTargetDesc/NVPTXBaseInfo.h"
+
+#include "llvm/PassManager.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/ADT/ValueMap.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/IRBuilder.h"
+
+using namespace llvm;
+
+namespace llvm {
+void initializeGenericToNVVMPass(PassRegistry &);
+}
+
+namespace {
+class GenericToNVVM : public ModulePass {
+public:
+  static char ID;
+
+  GenericToNVVM() : ModulePass(ID) {}
+
+  virtual bool runOnModule(Module &M);
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+  }
+
+private:
+  Value *getOrInsertCVTA(Module *M, Function *F, GlobalVariable *GV,
+                         IRBuilder<> &Builder);
+  Value *remapConstant(Module *M, Function *F, Constant *C,
+                       IRBuilder<> &Builder);
+  Value *remapConstantVectorOrConstantAggregate(Module *M, Function *F,
+                                                Constant *C,
+                                                IRBuilder<> &Builder);
+  Value *remapConstantExpr(Module *M, Function *F, ConstantExpr *C,
+                           IRBuilder<> &Builder);
+  void remapNamedMDNode(Module *M, NamedMDNode *N);
+  MDNode *remapMDNode(Module *M, MDNode *N);
+
+  typedef ValueMap<GlobalVariable *, GlobalVariable *> GVMapTy;
+  typedef ValueMap<Constant *, Value *> ConstantToValueMapTy;
+  GVMapTy GVMap;
+  ConstantToValueMapTy ConstantToValueMap;
+};
+}
+
+char GenericToNVVM::ID = 0;
+
+ModulePass *llvm::createGenericToNVVMPass() { return new GenericToNVVM(); }
+
+INITIALIZE_PASS(
+    GenericToNVVM, "generic-to-nvvm",
+    "Ensure that the global variables are in the global address space", false,
+    false)
+
+bool GenericToNVVM::runOnModule(Module &M) {
+  // Create a clone of each global variable that has the default address space.
+  // The clone is created with the global address space specifier, and the pair
+  // of original global variable and its clone is placed in the GVMap for later
+  // use.
+
+  for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+       I != E;) {
+    GlobalVariable *GV = I++;
+    if (GV->getType()->getAddressSpace() == llvm::ADDRESS_SPACE_GENERIC &&
+        !llvm::isTexture(*GV) && !llvm::isSurface(*GV) &&
+        !GV->getName().startswith("llvm.")) {
+      GlobalVariable *NewGV = new GlobalVariable(
+          M, GV->getType()->getElementType(), GV->isConstant(),
+          GV->getLinkage(), GV->hasInitializer() ? GV->getInitializer() : NULL,
+          "", GV, GV->getThreadLocalMode(), llvm::ADDRESS_SPACE_GLOBAL);
+      NewGV->copyAttributesFrom(GV);
+      GVMap[GV] = NewGV;
+    }
+  }
+
+  // Return immediately if every global variable has a specific address space
+  // specifier.
+  if (GVMap.empty()) {
+    return false;
+  }
+
+  // Walk through the instructions in function definitions, and replace any use
+  // of original global variables in GVMap with a use of the corresponding
+  // copies in GVMap. If necessary, promote constants to instructions.
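("Promote constants to instructions" here means re-emitting a ConstantExpr as ordinary instructions at its use site, since a constant that mentions a soon-to-be-erased global cannot be edited in place. A minimal sketch of the general mechanism, assuming ConstantExpr::getAsInstruction() from this era's tree; the pass's own remapConstantExpr below rebuilds operands more selectively:)

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Re-emit a constant expression as a real instruction so its operands
    // can afterwards be swapped for values computed at run time.
    static Value *promoteToInstruction(ConstantExpr *CE, IRBuilder<> &B) {
      return B.Insert(CE->getAsInstruction());
    }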
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) {
+    if (I->isDeclaration()) {
+      continue;
+    }
+    IRBuilder<> Builder(I->getEntryBlock().getFirstNonPHIOrDbg());
+    for (Function::iterator BBI = I->begin(), BBE = I->end(); BBI != BBE;
+         ++BBI) {
+      for (BasicBlock::iterator II = BBI->begin(), IE = BBI->end(); II != IE;
+           ++II) {
+        for (unsigned i = 0, e = II->getNumOperands(); i < e; ++i) {
+          Value *Operand = II->getOperand(i);
+          if (isa<Constant>(Operand)) {
+            II->setOperand(
+                i, remapConstant(&M, I, cast<Constant>(Operand), Builder));
+          }
+        }
+      }
+    }
+    ConstantToValueMap.clear();
+  }
+
+  // Walk through the metadata section and update the debug information
+  // associated with the global variables in the default address space.
+  for (Module::named_metadata_iterator I = M.named_metadata_begin(),
+                                       E = M.named_metadata_end();
+       I != E; I++) {
+    remapNamedMDNode(&M, I);
+  }
+
+  // Walk through the global variable initializers, and replace any use of
+  // original global variables in GVMap with a use of the corresponding copies
+  // in GVMap. The copies need to be bitcast to the original global variable
+  // types, as we cannot use cvta in global variable initializers.
+  for (GVMapTy::iterator I = GVMap.begin(), E = GVMap.end(); I != E;) {
+    GlobalVariable *GV = I->first;
+    GlobalVariable *NewGV = I->second;
+    ++I;
+    Constant *BitCastNewGV = ConstantExpr::getBitCast(NewGV, GV->getType());
+    // At this point, the remaining uses of GV should be found only in global
+    // variable initializers, as other uses have already been removed
+    // while walking through the instructions in function definitions.
+    for (Value::use_iterator UI = GV->use_begin(), UE = GV->use_end();
+         UI != UE;) {
+      Use &U = (UI++).getUse();
+      U.set(BitCastNewGV);
+    }
+    std::string Name = GV->getName();
+    GV->removeDeadConstantUsers();
+    GV->eraseFromParent();
+    NewGV->setName(Name);
+  }
+  GVMap.clear();
+
+  return true;
+}
+
+Value *GenericToNVVM::getOrInsertCVTA(Module *M, Function *F,
+                                      GlobalVariable *GV,
+                                      IRBuilder<> &Builder) {
+  PointerType *GVType = GV->getType();
+  Value *CVTA = NULL;
+
+  // See if the address space conversion requires the operand to be bitcast
+  // to i8 addrspace(n)* first.
+  EVT ExtendedGVType = EVT::getEVT(GVType->getElementType(), true);
+  if (!ExtendedGVType.isInteger() && !ExtendedGVType.isFloatingPoint()) {
+    // A bitcast to i8 addrspace(n)* on the operand is needed.
+    LLVMContext &Context = M->getContext();
+    unsigned int AddrSpace = GVType->getAddressSpace();
+    Type *DestTy = PointerType::get(Type::getInt8Ty(Context), AddrSpace);
+    CVTA = Builder.CreateBitCast(GV, DestTy, "cvta");
+    // Insert the address space conversion.
+    Type *ResultType =
+        PointerType::get(Type::getInt8Ty(Context), llvm::ADDRESS_SPACE_GENERIC);
+    SmallVector<Type *, 2> ParamTypes;
+    ParamTypes.push_back(ResultType);
+    ParamTypes.push_back(DestTy);
+    Function *CVTAFunction = Intrinsic::getDeclaration(
+        M, Intrinsic::nvvm_ptr_global_to_gen, ParamTypes);
+    CVTA = Builder.CreateCall(CVTAFunction, CVTA, "cvta");
+    // Another bitcast from i8 * to <the element type of GVType> * is
+    // required.
+    DestTy =
+        PointerType::get(GVType->getElementType(), llvm::ADDRESS_SPACE_GENERIC);
+    CVTA = Builder.CreateBitCast(CVTA, DestTy, "cvta");
+  } else {
+    // A simple CVTA is enough.
+    SmallVector<Type *, 2> ParamTypes;
+    ParamTypes.push_back(PointerType::get(GVType->getElementType(),
+                                          llvm::ADDRESS_SPACE_GENERIC));
+    ParamTypes.push_back(GVType);
+    Function *CVTAFunction = Intrinsic::getDeclaration(
+        M, Intrinsic::nvvm_ptr_global_to_gen, ParamTypes);
+    CVTA = Builder.CreateCall(CVTAFunction, GV, "cvta");
+  }
+
+  return CVTA;
+}
+
+Value *GenericToNVVM::remapConstant(Module *M, Function *F, Constant *C,
+                                    IRBuilder<> &Builder) {
+  // If the constant C has been converted already in the given function F, just
+  // return the converted value.
+  ConstantToValueMapTy::iterator CTII = ConstantToValueMap.find(C);
+  if (CTII != ConstantToValueMap.end()) {
+    return CTII->second;
+  }
+
+  Value *NewValue = C;
+  if (isa<GlobalVariable>(C)) {
+    // If the constant C is a global variable and is found in GVMap, generate a
+    // set of instructions that convert the clone of C with the global
+    // address space specifier to a generic pointer.
+    // The constant C cannot be used here, as it will be erased from the
+    // module eventually. And the clone of C with the global address space
+    // specifier cannot be used here either, as it will affect the types of
+    // other instructions in the function. Hence, this address space conversion
+    // is required.
+    GVMapTy::iterator I = GVMap.find(cast<GlobalVariable>(C));
+    if (I != GVMap.end()) {
+      NewValue = getOrInsertCVTA(M, F, I->second, Builder);
+    }
+  } else if (isa<ConstantVector>(C) || isa<ConstantArray>(C) ||
+             isa<ConstantStruct>(C)) {
+    // If any element in the constant vector or aggregate C is or uses a global
+    // variable in GVMap, the constant C needs to be reconstructed, using a set
+    // of instructions.
+    NewValue = remapConstantVectorOrConstantAggregate(M, F, C, Builder);
+  } else if (isa<ConstantExpr>(C)) {
+    // If any operand in the constant expression C is or uses a global variable
+    // in GVMap, the constant expression C needs to be reconstructed, using a
+    // set of instructions.
+    NewValue = remapConstantExpr(M, F, cast<ConstantExpr>(C), Builder);
+  }
+
+  ConstantToValueMap[C] = NewValue;
+  return NewValue;
+}
+
+Value *GenericToNVVM::remapConstantVectorOrConstantAggregate(
+    Module *M, Function *F, Constant *C, IRBuilder<> &Builder) {
+  bool OperandChanged = false;
+  SmallVector<Value *, 4> NewOperands;
+  unsigned NumOperands = C->getNumOperands();
+
+  // Check if any element is or uses a global variable in GVMap, and thus
+  // converted to another value.
+  for (unsigned i = 0; i < NumOperands; ++i) {
+    Value *Operand = C->getOperand(i);
+    Value *NewOperand = remapConstant(M, F, cast<Constant>(Operand), Builder);
+    OperandChanged |= Operand != NewOperand;
+    NewOperands.push_back(NewOperand);
+  }
+
+  // If none of the elements has been modified, return C as it is.
+  if (!OperandChanged) {
+    return C;
+  }
+
+  // If any of the elements has been modified, construct the equivalent
+  // vector or aggregate value with a set of instructions and the converted
+  // elements.
+  Value *NewValue = UndefValue::get(C->getType());
+  if (isa<ConstantVector>(C)) {
+    for (unsigned i = 0; i < NumOperands; ++i) {
+      Value *Idx = ConstantInt::get(Type::getInt32Ty(M->getContext()), i);
+      NewValue = Builder.CreateInsertElement(NewValue, NewOperands[i], Idx);
+    }
+  } else {
+    for (unsigned i = 0; i < NumOperands; ++i) {
+      NewValue =
+          Builder.CreateInsertValue(NewValue, NewOperands[i], makeArrayRef(i));
+    }
+  }
+
+  return NewValue;
+}
+
+Value *GenericToNVVM::remapConstantExpr(Module *M, Function *F, ConstantExpr *C,
+                                        IRBuilder<> &Builder) {
+  bool OperandChanged = false;
+  SmallVector<Value *, 4> NewOperands;
+  unsigned NumOperands = C->getNumOperands();
+
+  // Check if any operand is or uses a global variable in GVMap, and thus
+  // converted to another value.
+  for (unsigned i = 0; i < NumOperands; ++i) {
+    Value *Operand = C->getOperand(i);
+    Value *NewOperand = remapConstant(M, F, cast<Constant>(Operand), Builder);
+    OperandChanged |= Operand != NewOperand;
+    NewOperands.push_back(NewOperand);
+  }
+
+  // If none of the operands has been modified, return C as it is.
+  if (!OperandChanged) {
+    return C;
+  }
+
+  // If any of the operands has been modified, construct the instruction with
+  // the converted operands.
+  unsigned Opcode = C->getOpcode();
+  switch (Opcode) {
+  case Instruction::ICmp:
+    // CompareConstantExpr (icmp)
+    return Builder.CreateICmp(CmpInst::Predicate(C->getPredicate()),
+                              NewOperands[0], NewOperands[1]);
+  case Instruction::FCmp:
+    // CompareConstantExpr (fcmp)
+    assert(false && "Address space conversion should have no effect "
+                    "on floating point CompareConstantExpr (fcmp)!");
+    return C;
+  case Instruction::ExtractElement:
+    // ExtractElementConstantExpr
+    return Builder.CreateExtractElement(NewOperands[0], NewOperands[1]);
+  case Instruction::InsertElement:
+    // InsertElementConstantExpr
+    return Builder.CreateInsertElement(NewOperands[0], NewOperands[1],
+                                       NewOperands[2]);
+  case Instruction::ShuffleVector:
+    // ShuffleVector
+    return Builder.CreateShuffleVector(NewOperands[0], NewOperands[1],
+                                       NewOperands[2]);
+  case Instruction::ExtractValue:
+    // ExtractValueConstantExpr
+    return Builder.CreateExtractValue(NewOperands[0], C->getIndices());
+  case Instruction::InsertValue:
+    // InsertValueConstantExpr
+    return Builder.CreateInsertValue(NewOperands[0], NewOperands[1],
+                                     C->getIndices());
+  case Instruction::GetElementPtr:
+    // GetElementPtrConstantExpr
+    return cast<GEPOperator>(C)->isInBounds()
+               ? Builder.CreateInBoundsGEP(
+                     NewOperands[0],
+                     makeArrayRef(&NewOperands[1], NumOperands - 1))
+               : Builder.CreateGEP(
+                     NewOperands[0],
+                     makeArrayRef(&NewOperands[1], NumOperands - 1));
+  case Instruction::Select:
+    // SelectConstantExpr
+    return Builder.CreateSelect(NewOperands[0], NewOperands[1], NewOperands[2]);
+  default:
+    // BinaryConstantExpr
+    if (Instruction::isBinaryOp(Opcode)) {
+      return Builder.CreateBinOp(Instruction::BinaryOps(C->getOpcode()),
+                                 NewOperands[0], NewOperands[1]);
+    }
+    // UnaryConstantExpr
+    if (Instruction::isCast(Opcode)) {
+      return Builder.CreateCast(Instruction::CastOps(C->getOpcode()),
+                                NewOperands[0], C->getType());
+    }
+    assert(false && "GenericToNVVM encountered an unsupported ConstantExpr");
+    return C;
+  }
+}
+
+void GenericToNVVM::remapNamedMDNode(Module *M, NamedMDNode *N) {
+
+  bool OperandChanged = false;
+  SmallVector<MDNode *, 16> NewOperands;
+  unsigned NumOperands = N->getNumOperands();
+
+  // Check if any operand is or contains a global variable in GVMap, and thus
+  // converted to another value.
+ for (unsigned i = 0; i < NumOperands; ++i) { + MDNode *Operand = N->getOperand(i); + MDNode *NewOperand = remapMDNode(M, Operand); + OperandChanged |= Operand != NewOperand; + NewOperands.push_back(NewOperand); + } + + // If none of the operands has been modified, return immediately. + if (!OperandChanged) { + return; + } + + // Replace the old operands with the new operands. + N->dropAllReferences(); + for (SmallVector<MDNode *, 16>::iterator I = NewOperands.begin(), + E = NewOperands.end(); + I != E; ++I) { + N->addOperand(*I); + } +} + +MDNode *GenericToNVVM::remapMDNode(Module *M, MDNode *N) { + + bool OperandChanged = false; + SmallVector<Value *, 8> NewOperands; + unsigned NumOperands = N->getNumOperands(); + + // Check if any operand is or contains a global variable in GVMap, and thus + // converted to another value. + for (unsigned i = 0; i < NumOperands; ++i) { + Value *Operand = N->getOperand(i); + Value *NewOperand = Operand; + if (Operand) { + if (isa<GlobalVariable>(Operand)) { + GVMapTy::iterator I = GVMap.find(cast<GlobalVariable>(Operand)); + if (I != GVMap.end()) { + NewOperand = I->second; + if (++i < NumOperands) { + NewOperands.push_back(NewOperand); + // Address space of the global variable follows the global variable + // in the global variable debug info (see createGlobalVariable in + // lib/Analysis/DIBuilder.cpp). + NewOperand = + ConstantInt::get(Type::getInt32Ty(M->getContext()), + I->second->getType()->getAddressSpace()); + } + } + } else if (isa<MDNode>(Operand)) { + NewOperand = remapMDNode(M, cast<MDNode>(Operand)); + } + } + OperandChanged |= Operand != NewOperand; + NewOperands.push_back(NewOperand); + } + + // If none of the operands has been modified, return N as it is. + if (!OperandChanged) { + return N; + } + + // If any of the operands has been modified, create a new MDNode with the new + // operands. + return MDNode::get(M->getContext(), makeArrayRef(NewOperands)); +} diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 0f4c8dbce5..d4378c2322 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -42,6 +42,11 @@ static cl::opt<int> UsePrecDivF32( " IEEE Compliant F32 div.rnd if avaiable."), cl::init(2)); +static cl::opt<bool> +UsePrecSqrtF32("nvptx-prec-sqrtf32", + cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), + cl::init(true)); + /// createNVPTXISelDag - This pass converts a legalized DAG into a /// NVPTX-specific DAG, ready for instruction scheduling. FunctionPass *llvm::createNVPTXISelDag(NVPTXTargetMachine &TM, @@ -74,6 +79,8 @@ NVPTXDAGToDAGISel::NVPTXDAGToDAGISel(NVPTXTargetMachine &tm, // Decide how to translate f32 div do_DIVF32_PREC = UsePrecDivF32; + // Decide how to translate f32 sqrt + do_SQRTF32_PREC = UsePrecSqrtF32; // sm less than sm_20 does not support div.rnd. Use div.full. if (do_DIVF32_PREC == 2 && !Subtarget.reqPTX20()) do_DIVF32_PREC = 1; diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h index 70e8e46429..ed16d4450b 100644 --- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h +++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h @@ -41,6 +41,10 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel { // Otherwise, use div.full int do_DIVF32_PREC; + // If true, generate sqrt.rn, else generate sqrt.approx. If FTZ + // is true, then generate the corresponding FTZ version. + bool do_SQRTF32_PREC; + // If true, add .ftz to f32 instructions. 
// This is only meaningful for sm_20 and later, as the default // is not ftz. diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td index f43abe283b..da6dd39b93 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -75,6 +75,9 @@ def allowFMA_ftz : Predicate<"(allowFMA && UseF32FTZ)">; def do_DIVF32_APPROX : Predicate<"do_DIVF32_PREC==0">; def do_DIVF32_FULL : Predicate<"do_DIVF32_PREC==1">; +def do_SQRTF32_APPROX : Predicate<"do_SQRTF32_PREC==0">; +def do_SQRTF32_RN : Predicate<"do_SQRTF32_PREC==1">; + def hasHWROT32 : Predicate<"Subtarget.hasHWROT32()">; def true : Predicate<"1">; diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td index 49e2568dfa..24037cafef 100644 --- a/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -512,6 +512,16 @@ def INT_NVVM_SQRT_RM_D : F_MATH_1<"sqrt.rm.f64 \t$dst, $src0;", Float64Regs, def INT_NVVM_SQRT_RP_D : F_MATH_1<"sqrt.rp.f64 \t$dst, $src0;", Float64Regs, Float64Regs, int_nvvm_sqrt_rp_d>; +// nvvm_sqrt intrinsic +def : Pat<(int_nvvm_sqrt_f Float32Regs:$a), + (INT_NVVM_SQRT_RN_FTZ_F Float32Regs:$a)>, Requires<[doF32FTZ, do_SQRTF32_RN]>; +def : Pat<(int_nvvm_sqrt_f Float32Regs:$a), + (INT_NVVM_SQRT_RN_F Float32Regs:$a)>, Requires<[do_SQRTF32_RN]>; +def : Pat<(int_nvvm_sqrt_f Float32Regs:$a), + (INT_NVVM_SQRT_APPROX_FTZ_F Float32Regs:$a)>, Requires<[doF32FTZ]>; +def : Pat<(int_nvvm_sqrt_f Float32Regs:$a), + (INT_NVVM_SQRT_APPROX_F Float32Regs:$a)>; + // // Rsqrt // @@ -1510,38 +1520,12 @@ multiclass G_TO_NG<string Str, Intrinsic Intrin> { defm cvta_local : NG_TO_G<"local", int_nvvm_ptr_local_to_gen>; defm cvta_shared : NG_TO_G<"shared", int_nvvm_ptr_shared_to_gen>; defm cvta_global : NG_TO_G<"global", int_nvvm_ptr_global_to_gen>; +defm cvta_const : NG_TO_G<"const", int_nvvm_ptr_constant_to_gen>; defm cvta_to_local : G_TO_NG<"local", int_nvvm_ptr_gen_to_local>; defm cvta_to_shared : G_TO_NG<"shared", int_nvvm_ptr_gen_to_shared>; defm cvta_to_global : G_TO_NG<"global", int_nvvm_ptr_gen_to_global>; - -def cvta_const : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), - "mov.u32 \t$result, $src;", - [(set Int32Regs:$result, (int_nvvm_ptr_constant_to_gen Int32Regs:$src))]>; -def cvta_const_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), - "mov.u64 \t$result, $src;", - [(set Int64Regs:$result, (int_nvvm_ptr_constant_to_gen Int64Regs:$src))]>; - - - -// @TODO: Revisit this. There is a type -// contradiction between iPTRAny and iPTR for the def. 
-/*def cvta_const_addr : NVPTXInst<(outs Int32Regs:$result), (ins imemAny:$src), - "mov.u32 \t$result, $src;", - [(set Int32Regs:$result, (int_nvvm_ptr_constant_to_gen - (Wrapper tglobaladdr:$src)))]>; -def cvta_const_addr_64 : NVPTXInst<(outs Int64Regs:$result), (ins imemAny:$src), - "mov.u64 \t$result, $src;", - [(set Int64Regs:$result, (int_nvvm_ptr_constant_to_gen - (Wrapper tglobaladdr:$src)))]>;*/ - - -def cvta_to_const : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), - "mov.u32 \t$result, $src;", - [(set Int32Regs:$result, (int_nvvm_ptr_gen_to_constant Int32Regs:$src))]>; -def cvta_to_const_64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), - "mov.u64 \t$result, $src;", - [(set Int64Regs:$result, (int_nvvm_ptr_gen_to_constant Int64Regs:$src))]>; +defm cvta_to_const : G_TO_NG<"const", int_nvvm_ptr_gen_to_constant>; // nvvm.ptr.gen.to.param diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 67ca6b58e5..1ae2a7cc86 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -49,6 +49,7 @@ using namespace llvm; namespace llvm { void initializeNVVMReflectPass(PassRegistry&); +void initializeGenericToNVVMPass(PassRegistry&); } extern "C" void LLVMInitializeNVPTXTarget() { @@ -62,6 +63,7 @@ extern "C" void LLVMInitializeNVPTXTarget() { // FIXME: This pass is really intended to be invoked during IR optimization, // but it's very NVPTX-specific. initializeNVVMReflectPass(*PassRegistry::getPassRegistry()); + initializeGenericToNVVMPass(*PassRegistry::getPassRegistry()); } NVPTXTargetMachine::NVPTXTargetMachine( @@ -100,6 +102,7 @@ public: return getTM<NVPTXTargetMachine>(); } + virtual void addIRPasses(); virtual bool addInstSelector(); virtual bool addPreRegAlloc(); }; @@ -110,6 +113,11 @@ TargetPassConfig *NVPTXTargetMachine::createPassConfig(PassManagerBase &PM) { return PassConfig; } +void NVPTXPassConfig::addIRPasses() { + TargetPassConfig::addIRPasses(); + addPass(createGenericToNVVMPass()); +} + bool NVPTXPassConfig::addInstSelector() { addPass(createLowerAggrCopies()); addPass(createSplitBBatBarPass()); diff --git a/lib/Target/NVPTX/NVVMReflect.cpp b/lib/Target/NVPTX/NVVMReflect.cpp index 0ad62ce39b..3cc324b85e 100644 --- a/lib/Target/NVPTX/NVVMReflect.cpp +++ b/lib/Target/NVPTX/NVVMReflect.cpp @@ -14,6 +14,7 @@ // //===----------------------------------------------------------------------===// +#include "NVPTX.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" @@ -40,7 +41,7 @@ using namespace llvm; namespace llvm { void initializeNVVMReflectPass(PassRegistry &); } namespace { -class LLVM_LIBRARY_VISIBILITY NVVMReflect : public ModulePass { +class NVVMReflect : public ModulePass { private: StringMap<int> VarMap; typedef DenseMap<std::string, int>::iterator VarMapIter; @@ -48,9 +49,18 @@ private: public: static char ID; - NVVMReflect() : ModulePass(ID) { + NVVMReflect() : ModulePass(ID), ReflectFunction(0) { + initializeNVVMReflectPass(*PassRegistry::getPassRegistry()); VarMap.clear(); - ReflectFunction = 0; + } + + NVVMReflect(const StringMap<int> &Mapping) + : ModulePass(ID), ReflectFunction(0) { + initializeNVVMReflectPass(*PassRegistry::getPassRegistry()); + for (StringMap<int>::const_iterator I = Mapping.begin(), E = Mapping.end(); + I != E; ++I) { + VarMap[(*I).getKey()] = (*I).getValue(); + } } void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); } @@ -60,6 +70,14 @@ public: }; } +ModulePass 
*llvm::createNVVMReflectPass() { + return new NVVMReflect(); +} + +ModulePass *llvm::createNVVMReflectPass(const StringMap<int>& Mapping) { + return new NVVMReflect(Mapping); +} + static cl::opt<bool> NVVMReflectEnabled("nvvm-reflect-enable", cl::init(true), cl::desc("NVVM reflection, enabled by default")); diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp index bd1c378681..3e608ca8f6 100644 --- a/lib/Target/PowerPC/PPCBranchSelector.cpp +++ b/lib/Target/PowerPC/PPCBranchSelector.cpp @@ -112,15 +112,21 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) { unsigned MBBStartOffset = 0; for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I) { - if (I->getOpcode() != PPC::BCC || I->getOperand(2).isImm()) { + MachineBasicBlock *Dest = 0; + if (I->getOpcode() == PPC::BCC && !I->getOperand(2).isImm()) + Dest = I->getOperand(2).getMBB(); + else if ((I->getOpcode() == PPC::BDNZ8 || I->getOpcode() == PPC::BDNZ || + I->getOpcode() == PPC::BDZ8 || I->getOpcode() == PPC::BDZ) && + !I->getOperand(0).isImm()) + Dest = I->getOperand(0).getMBB(); + + if (!Dest) { MBBStartOffset += TII->GetInstSizeInBytes(I); continue; } // Determine the offset from the current branch to the destination // block. - MachineBasicBlock *Dest = I->getOperand(2).getMBB(); - int BranchSize; if (Dest->getNumber() <= MBB.getNumber()) { // If this is a backwards branch, the delta is the offset from the diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td index b44d2482d5..d2620b2877 100644 --- a/lib/Target/R600/AMDGPUInstructions.td +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -284,6 +284,17 @@ class SHA256MaPattern <Instruction BFI_INT, Instruction XOR> : Pat < (BFI_INT (XOR i32:$x, i32:$y), i32:$z, i32:$y) >; +// Bitfield extract patterns + +def legalshift32 : ImmLeaf <i32, [{return Imm >=0 && Imm < 32;}]>; +def bfemask : PatLeaf <(imm), [{return isMask_32(N->getZExtValue());}], + SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(CountTrailingOnes_32(N->getZExtValue()), MVT::i32);}]>>; + +class BFEPattern <Instruction BFE> : Pat < + (and (srl i32:$x, legalshift32:$y), bfemask:$z), + (BFE $x, $y, $z) +>; + include "R600Instructions.td" include "SIInstrInfo.td" diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index 0ec67ce13b..31fbf32d0c 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -115,7 +115,6 @@ AMDGPUPassConfig::addPreISel() { } bool AMDGPUPassConfig::addInstSelector() { - addPass(createAMDGPUPeepholeOpt(*TM)); addPass(createAMDGPUISelDag(getAMDGPUTargetMachine())); const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); diff --git a/lib/Target/R600/AMDILDeviceInfo.cpp b/lib/Target/R600/AMDILDeviceInfo.cpp index 178795936a..126514b976 100644 --- a/lib/Target/R600/AMDILDeviceInfo.cpp +++ b/lib/Target/R600/AMDILDeviceInfo.cpp @@ -81,7 +81,8 @@ AMDGPUDevice* getDeviceFromName(const std::string &deviceName, return new AMDGPUNIDevice(ptr); } else if (deviceName == "SI" || deviceName == "tahiti" || deviceName == "pitcairn" || - deviceName == "verde" || deviceName == "oland") { + deviceName == "verde" || deviceName == "oland" || + deviceName == "hainan") { return new AMDGPUSIDevice(ptr); } else { #if DEBUG diff --git a/lib/Target/R600/AMDILPeepholeOptimizer.cpp b/lib/Target/R600/AMDILPeepholeOptimizer.cpp deleted file mode 100644 index 3a28038666..0000000000 --- a/lib/Target/R600/AMDILPeepholeOptimizer.cpp 
+++ /dev/null @@ -1,1215 +0,0 @@ -//===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations ---------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -/// \file -//==-----------------------------------------------------------------------===// - -#define DEBUG_TYPE "PeepholeOpt" -#ifdef DEBUG -#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE)) -#else -#define DEBUGME 0 -#endif - -#include "AMDILDevices.h" -#include "AMDGPUInstrInfo.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/ADT/StringExtras.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/ADT/Twine.h" -#include "llvm/IR/Constants.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionAnalysis.h" -#include "llvm/IR/Function.h" -#include "llvm/IR/Instructions.h" -#include "llvm/IR/Module.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/MathExtras.h" - -#include <sstream> - -#if 0 -STATISTIC(PointerAssignments, "Number of dynamic pointer " - "assigments discovered"); -STATISTIC(PointerSubtract, "Number of pointer subtractions discovered"); -#endif - -using namespace llvm; -// The Peephole optimization pass is used to do simple last minute optimizations -// that are required for correct code or to remove redundant functions -namespace { - -class OpaqueType; - -class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass { -public: - TargetMachine &TM; - static char ID; - AMDGPUPeepholeOpt(TargetMachine &tm); - ~AMDGPUPeepholeOpt(); - const char *getPassName() const; - bool runOnFunction(Function &F); - bool doInitialization(Module &M); - bool doFinalization(Module &M); - void getAnalysisUsage(AnalysisUsage &AU) const; -protected: -private: - // Function to initiate all of the instruction level optimizations. - bool instLevelOptimizations(BasicBlock::iterator *inst); - // Quick check to see if we need to dump all of the pointers into the - // arena. If this is correct, then we set all pointers to exist in arena. This - // is a workaround for aliasing of pointers in a struct/union. - bool dumpAllIntoArena(Function &F); - // Because I don't want to invalidate any pointers while in the - // safeNestedForEachFunction. I push atomic conversions to a vector and handle - // it later. This function does the conversions if required. - void doAtomicConversionIfNeeded(Function &F); - // Because __amdil_is_constant cannot be properly evaluated if - // optimizations are disabled, the call's are placed in a vector - // and evaluated after the __amdil_image* functions are evaluated - // which should allow the __amdil_is_constant function to be - // evaluated correctly. - void doIsConstCallConversionIfNeeded(); - bool mChanged; - bool mDebug; - bool mConvertAtomics; - CodeGenOpt::Level optLevel; - // Run a series of tests to see if we can optimize a CALL instruction. - bool optimizeCallInst(BasicBlock::iterator *bbb); - // A peephole optimization to optimize bit extract sequences. - bool optimizeBitExtract(Instruction *inst); - // A peephole optimization to optimize bit insert sequences. - bool optimizeBitInsert(Instruction *inst); - bool setupBitInsert(Instruction *base, - Instruction *&src, - Constant *&mask, - Constant *&shift); - // Expand the bit field insert instruction on versions of OpenCL that - // don't support it. - bool expandBFI(CallInst *CI); - // Expand the bit field mask instruction on version of OpenCL that - // don't support it. 
- bool expandBFM(CallInst *CI); - // On 7XX and 8XX operations, we do not have 24 bit signed operations. So in - // this case we need to expand them. These functions check for 24bit functions - // and then expand. - bool isSigned24BitOps(CallInst *CI); - void expandSigned24BitOps(CallInst *CI); - // One optimization that can occur is that if the required workgroup size is - // specified then the result of get_local_size is known at compile time and - // can be returned accordingly. - bool isRWGLocalOpt(CallInst *CI); - // On northern island cards, the division is slightly less accurate than on - // previous generations, so we need to utilize a more accurate division. So we - // can translate the accurate divide to a normal divide on all other cards. - bool convertAccurateDivide(CallInst *CI); - void expandAccurateDivide(CallInst *CI); - // If the alignment is set incorrectly, it can produce really inefficient - // code. This checks for this scenario and fixes it if possible. - bool correctMisalignedMemOp(Instruction *inst); - - // If we are in no opt mode, then we need to make sure that - // local samplers are properly propagated as constant propagation - // doesn't occur and we need to know the value of kernel defined - // samplers at compile time. - bool propagateSamplerInst(CallInst *CI); - - // Helper functions - - // Group of functions that recursively calculate the size of a structure based - // on it's sub-types. - size_t getTypeSize(Type * const T, bool dereferencePtr = false); - size_t getTypeSize(StructType * const ST, bool dereferencePtr = false); - size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false); - size_t getTypeSize(FunctionType * const FT,bool dereferencePtr = false); - size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false); - size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false); - size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false); - size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false); - - LLVMContext *mCTX; - Function *mF; - const AMDGPUSubtarget *mSTM; - SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs; - SmallVector<CallInst *, 16> isConstVec; -}; // class AMDGPUPeepholeOpt - char AMDGPUPeepholeOpt::ID = 0; - -// A template function that has two levels of looping before calling the -// function with a pointer to the current iterator. 
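(A self-contained STL analog of the iterate-while-erasing discipline that the safeNestedForEach template below implements — illustrative only, not LLVM code: the inner iterator advances only when the callback did not erase the current element, which is exactly why the template threads an iterator pointer through the callback.)

    #include <iostream>
    #include <list>
    #include <vector>

    int main() {
      std::vector<std::list<int> > blocks(1);
      for (int v = 1; v <= 5; ++v)
        blocks[0].push_back(v);
      // Outer/inner loop mirroring safeNestedForEach: erase even elements
      // in place without invalidating the iterator we stand on.
      for (std::vector<std::list<int> >::iterator b = blocks.begin();
           b != blocks.end(); ++b)
        for (std::list<int>::iterator i = b->begin(); i != b->end();)
          if (*i % 2 == 0)
            i = b->erase(i); // erase returns the next valid iterator
          else
            ++i;             // advance only when nothing was erased
      for (std::list<int>::iterator i = blocks[0].begin();
           i != blocks[0].end(); ++i)
        std::cout << *i << ' '; // prints: 1 3 5
      std::cout << '\n';
      return 0;
    }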
-template<class InputIterator, class SecondIterator, class Function> -Function safeNestedForEach(InputIterator First, InputIterator Last, - SecondIterator S, Function F) { - for ( ; First != Last; ++First) { - SecondIterator sf, sl; - for (sf = First->begin(), sl = First->end(); - sf != sl; ) { - if (!F(&sf)) { - ++sf; - } - } - } - return F; -} - -} // anonymous namespace - -namespace llvm { - FunctionPass * - createAMDGPUPeepholeOpt(TargetMachine &tm) { - return new AMDGPUPeepholeOpt(tm); - } -} // llvm namespace - -AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm) - : FunctionPass(ID), TM(tm) { - mDebug = DEBUGME; - optLevel = TM.getOptLevel(); - -} - -AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt() { -} - -const char * -AMDGPUPeepholeOpt::getPassName() const { - return "AMDGPU PeepHole Optimization Pass"; -} - -bool -containsPointerType(Type *Ty) { - if (!Ty) { - return false; - } - switch(Ty->getTypeID()) { - default: - return false; - case Type::StructTyID: { - const StructType *ST = dyn_cast<StructType>(Ty); - for (StructType::element_iterator stb = ST->element_begin(), - ste = ST->element_end(); stb != ste; ++stb) { - if (!containsPointerType(*stb)) { - continue; - } - return true; - } - break; - } - case Type::VectorTyID: - case Type::ArrayTyID: - return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType()); - case Type::PointerTyID: - return true; - }; - return false; -} - -bool -AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F) { - bool dumpAll = false; - for (Function::const_arg_iterator cab = F.arg_begin(), - cae = F.arg_end(); cab != cae; ++cab) { - const Argument *arg = cab; - const PointerType *PT = dyn_cast<PointerType>(arg->getType()); - if (!PT) { - continue; - } - Type *DereferencedType = PT->getElementType(); - if (!dyn_cast<StructType>(DereferencedType) - ) { - continue; - } - if (!containsPointerType(DereferencedType)) { - continue; - } - // FIXME: Because a pointer inside of a struct/union may be aliased to - // another pointer we need to take the conservative approach and place all - // pointers into the arena until more advanced detection is implemented. - dumpAll = true; - } - return dumpAll; -} -void -AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() { - if (isConstVec.empty()) { - return; - } - for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) { - CallInst *CI = isConstVec[x]; - Constant *CV = dyn_cast<Constant>(CI->getOperand(0)); - Type *aType = Type::getInt32Ty(*mCTX); - Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1) - : ConstantInt::get(aType, 0); - CI->replaceAllUsesWith(Val); - CI->eraseFromParent(); - } - isConstVec.clear(); -} -void -AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F) { - // Don't do anything if we don't have any atomic operations. 
- if (atomicFuncs.empty()) { - return; - } - // Change the function name for the atomic if it is required - uint32_t size = atomicFuncs.size(); - for (uint32_t x = 0; x < size; ++x) { - atomicFuncs[x].first->setOperand( - atomicFuncs[x].first->getNumOperands()-1, - atomicFuncs[x].second); - - } - mChanged = true; - if (mConvertAtomics) { - return; - } -} - -bool -AMDGPUPeepholeOpt::runOnFunction(Function &MF) { - mChanged = false; - mF = &MF; - mSTM = &TM.getSubtarget<AMDGPUSubtarget>(); - if (mDebug) { - MF.dump(); - } - mCTX = &MF.getType()->getContext(); - mConvertAtomics = true; - safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(), - std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations), - this)); - - doAtomicConversionIfNeeded(MF); - doIsConstCallConversionIfNeeded(); - - if (mDebug) { - MF.dump(); - } - return mChanged; -} - -bool -AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) { - Instruction *inst = (*bbb); - CallInst *CI = dyn_cast<CallInst>(inst); - if (!CI) { - return false; - } - if (isSigned24BitOps(CI)) { - expandSigned24BitOps(CI); - ++(*bbb); - CI->eraseFromParent(); - return true; - } - if (propagateSamplerInst(CI)) { - return false; - } - if (expandBFI(CI) || expandBFM(CI)) { - ++(*bbb); - CI->eraseFromParent(); - return true; - } - if (convertAccurateDivide(CI)) { - expandAccurateDivide(CI); - ++(*bbb); - CI->eraseFromParent(); - return true; - } - - StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName(); - if (calleeName.startswith("__amdil_is_constant")) { - // If we do not have optimizations, then this - // cannot be properly evaluated, so we add the - // call instruction to a vector and process - // them at the end of processing after the - // samplers have been correctly handled. - if (optLevel == CodeGenOpt::None) { - isConstVec.push_back(CI); - return false; - } else { - Constant *CV = dyn_cast<Constant>(CI->getOperand(0)); - Type *aType = Type::getInt32Ty(*mCTX); - Value *Val = (CV != NULL) ? 
ConstantInt::get(aType, 1) - : ConstantInt::get(aType, 0); - CI->replaceAllUsesWith(Val); - ++(*bbb); - CI->eraseFromParent(); - return true; - } - } - - if (calleeName.equals("__amdil_is_asic_id_i32")) { - ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0)); - Type *aType = Type::getInt32Ty(*mCTX); - Value *Val = CV; - if (Val) { - Val = ConstantInt::get(aType, - mSTM->device()->getDeviceFlag() & CV->getZExtValue()); - } else { - Val = ConstantInt::get(aType, 0); - } - CI->replaceAllUsesWith(Val); - ++(*bbb); - CI->eraseFromParent(); - return true; - } - Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1)); - if (!F) { - return false; - } - if (F->getName().startswith("__atom") && !CI->getNumUses() - && F->getName().find("_xchg") == StringRef::npos) { - std::string buffer(F->getName().str() + "_noret"); - F = dyn_cast<Function>( - F->getParent()->getOrInsertFunction(buffer, F->getFunctionType())); - atomicFuncs.push_back(std::make_pair(CI, F)); - } - - if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment) - && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) { - return false; - } - if (!mConvertAtomics) { - return false; - } - StringRef name = F->getName(); - if (name.startswith("__atom") && name.find("_g") != StringRef::npos) { - mConvertAtomics = false; - } - return false; -} - -bool -AMDGPUPeepholeOpt::setupBitInsert(Instruction *base, - Instruction *&src, - Constant *&mask, - Constant *&shift) { - if (!base) { - if (mDebug) { - dbgs() << "Null pointer passed into function.\n"; - } - return false; - } - bool andOp = false; - if (base->getOpcode() == Instruction::Shl) { - shift = dyn_cast<Constant>(base->getOperand(1)); - } else if (base->getOpcode() == Instruction::And) { - mask = dyn_cast<Constant>(base->getOperand(1)); - andOp = true; - } else { - if (mDebug) { - dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n"; - } - // If the base is neither a Shl or a And, we don't fit any of the patterns above. - return false; - } - src = dyn_cast<Instruction>(base->getOperand(0)); - if (!src) { - if (mDebug) { - dbgs() << "Failed setup since the base operand is not an instruction!\n"; - } - return false; - } - // If we find an 'and' operation, then we don't need to - // find the next operation as we already know the - // bits that are valid at this point. - if (andOp) { - return true; - } - if (src->getOpcode() == Instruction::Shl && !shift) { - shift = dyn_cast<Constant>(src->getOperand(1)); - src = dyn_cast<Instruction>(src->getOperand(0)); - } else if (src->getOpcode() == Instruction::And && !mask) { - mask = dyn_cast<Constant>(src->getOperand(1)); - } - if (!mask && !shift) { - if (mDebug) { - dbgs() << "Failed setup since both mask and shift are NULL!\n"; - } - // Did not find a constant mask or a shift. - return false; - } - return true; -} -bool -AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst) { - if (!inst) { - return false; - } - if (!inst->isBinaryOp()) { - return false; - } - if (inst->getOpcode() != Instruction::Or) { - return false; - } - if (optLevel == CodeGenOpt::None) { - return false; - } - // We want to do an optimization on a sequence of ops that in the end equals a - // single ISA instruction. 
- // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F) - // Some simplified versions of this pattern are as follows: - // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0 - // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E - // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B - // (A & B) | (D << F) when (1 << F) >= B - // (A << C) | (D & E) when (1 << C) >= E - if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) { - // The HD4XXX hardware doesn't support the ubit_insert instruction. - return false; - } - Type *aType = inst->getType(); - bool isVector = aType->isVectorTy(); - int numEle = 1; - // This optimization only works on 32bit integers. - if (aType->getScalarType() - != Type::getInt32Ty(inst->getContext())) { - return false; - } - if (isVector) { - const VectorType *VT = dyn_cast<VectorType>(aType); - numEle = VT->getNumElements(); - // We currently cannot support more than 4 elements in a intrinsic and we - // cannot support Vec3 types. - if (numEle > 4 || numEle == 3) { - return false; - } - } - // TODO: Handle vectors. - if (isVector) { - if (mDebug) { - dbgs() << "!!! Vectors are not supported yet!\n"; - } - return false; - } - Instruction *LHSSrc = NULL, *RHSSrc = NULL; - Constant *LHSMask = NULL, *RHSMask = NULL; - Constant *LHSShift = NULL, *RHSShift = NULL; - Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0)); - Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1)); - if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) { - if (mDebug) { - dbgs() << "Found an OR Operation that failed setup!\n"; - inst->dump(); - if (LHS) { LHS->dump(); } - if (LHSSrc) { LHSSrc->dump(); } - if (LHSMask) { LHSMask->dump(); } - if (LHSShift) { LHSShift->dump(); } - } - // There was an issue with the setup for BitInsert. - return false; - } - if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) { - if (mDebug) { - dbgs() << "Found an OR Operation that failed setup!\n"; - inst->dump(); - if (RHS) { RHS->dump(); } - if (RHSSrc) { RHSSrc->dump(); } - if (RHSMask) { RHSMask->dump(); } - if (RHSShift) { RHSShift->dump(); } - } - // There was an issue with the setup for BitInsert. - return false; - } - if (mDebug) { - dbgs() << "Found an OR operation that can possible be optimized to ubit insert!\n"; - dbgs() << "Op: "; inst->dump(); - dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; } - dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; } - dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; } - dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; } - dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; } - dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; } - dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; } - dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; } - } - Constant *offset = NULL; - Constant *width = NULL; - uint32_t lhsMaskVal = 0, rhsMaskVal = 0; - uint32_t lhsShiftVal = 0, rhsShiftVal = 0; - uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0; - uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0; - lhsMaskVal = (LHSMask - ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0); - rhsMaskVal = (RHSMask - ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0); - lhsShiftVal = (LHSShift - ? 
dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0); - rhsShiftVal = (RHSShift - ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0); - lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal; - rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal; - lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal; - rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal; - // TODO: Handle the case of A & B | D & ~B(i.e. inverted masks). - if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) { - return false; - } - if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) { - offset = ConstantInt::get(aType, lhsMaskOffset, false); - width = ConstantInt::get(aType, lhsMaskWidth, false); - RHSSrc = RHS; - if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) { - return false; - } - if (!LHSShift) { - LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, - "MaskShr", LHS); - } else if (lhsShiftVal != lhsMaskOffset) { - LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, - "MaskShr", LHS); - } - if (mDebug) { - dbgs() << "Optimizing LHS!\n"; - } - } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) { - offset = ConstantInt::get(aType, rhsMaskOffset, false); - width = ConstantInt::get(aType, rhsMaskWidth, false); - LHSSrc = RHSSrc; - RHSSrc = LHS; - if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) { - return false; - } - if (!RHSShift) { - LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, - "MaskShr", RHS); - } else if (rhsShiftVal != rhsMaskOffset) { - LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, - "MaskShr", RHS); - } - if (mDebug) { - dbgs() << "Optimizing RHS!\n"; - } - } else { - if (mDebug) { - dbgs() << "Failed constraint 3!\n"; - } - return false; - } - if (mDebug) { - dbgs() << "Width: "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; } - dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; } - dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; } - dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; } - } - if (!offset || !width) { - if (mDebug) { - dbgs() << "Either width or offset are NULL, failed detection!\n"; - } - return false; - } - // Lets create the function signature. 
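(For reference, a self-contained model of what __amdil_ubit_insert(width, offset, src, dst) plausibly computes; the semantics are assumed from the surrounding pass — the Operands[4] = {width, offset, LHSSrc, RHSSrc} call built below — not taken from an IL specification:)

    #include <cassert>
    #include <cstdint>

    // Replace `width` bits of dst, starting at bit `offset`, with the low
    // bits of src shifted into position.
    static uint32_t ubitInsert(uint32_t width, uint32_t offset,
                               uint32_t src, uint32_t dst) {
      uint32_t mask = (width >= 32 ? ~0u : ((1u << width) - 1u)) << offset;
      return (dst & ~mask) | ((src << offset) & mask);
    }

    int main() {
      assert(ubitInsert(4, 0, 0x5, 0xAB) == 0xA5);
      assert(ubitInsert(8, 8, 0x12, 0xABCD00FFu) == 0xABCD12FFu);
      return 0;
    }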
- std::vector<Type *> callTypes; - callTypes.push_back(aType); - callTypes.push_back(aType); - callTypes.push_back(aType); - callTypes.push_back(aType); - FunctionType *funcType = FunctionType::get(aType, callTypes, false); - std::string name = "__amdil_ubit_insert"; - if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; } - Function *Func = - dyn_cast<Function>(inst->getParent()->getParent()->getParent()-> - getOrInsertFunction(StringRef(name), funcType)); - Value *Operands[4] = { - width, - offset, - LHSSrc, - RHSSrc - }; - CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt"); - if (mDebug) { - dbgs() << "Old Inst: "; - inst->dump(); - dbgs() << "New Inst: "; - CI->dump(); - dbgs() << "\n\n"; - } - CI->insertBefore(inst); - inst->replaceAllUsesWith(CI); - return true; -} - -bool -AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst) { - if (!inst) { - return false; - } - if (!inst->isBinaryOp()) { - return false; - } - if (inst->getOpcode() != Instruction::And) { - return false; - } - if (optLevel == CodeGenOpt::None) { - return false; - } - // We want to do some simple optimizations on Shift right/And patterns. The - // basic optimization is to turn (A >> B) & C where A is a 32bit type, B is a - // value smaller than 32 and C is a mask. If C is a constant value, then the - // following transformation can occur. For signed integers, it turns into the - // function call dst = __amdil_ibit_extract(log2(C), B, A) For unsigned - // integers, it turns into the function call dst = - // __amdil_ubit_extract(log2(C), B, A) The function __amdil_[u|i]bit_extract - // can be found in Section 7.9 of the ATI IL spec of the stream SDK for - // Evergreen hardware. - if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) { - // This does not work on HD4XXX hardware. - return false; - } - Type *aType = inst->getType(); - bool isVector = aType->isVectorTy(); - - // XXX Support vector types - if (isVector) { - return false; - } - int numEle = 1; - // This only works on 32bit integers - if (aType->getScalarType() - != Type::getInt32Ty(inst->getContext())) { - return false; - } - if (isVector) { - const VectorType *VT = dyn_cast<VectorType>(aType); - numEle = VT->getNumElements(); - // We currently cannot support more than 4 elements in a intrinsic and we - // cannot support Vec3 types. - if (numEle > 4 || numEle == 3) { - return false; - } - } - BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0)); - // If the first operand is not a shift instruction, then we can return as it - // doesn't match this pattern. - if (!ShiftInst || !ShiftInst->isShift()) { - return false; - } - // If we are a shift left, then we need don't match this pattern. - if (ShiftInst->getOpcode() == Instruction::Shl) { - return false; - } - bool isSigned = ShiftInst->isArithmeticShift(); - Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1)); - Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1)); - // Lets make sure that the shift value and the and mask are constant integers. 
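(The validity checks that follow reduce to simple mask arithmetic; a standalone illustration in plain C++, mirroring what isMask_32 and CountTrailingOnes_32 from llvm/Support/MathExtras.h compute:)

    #include <cassert>
    #include <cstdint>

    // A "mask" here means all ones in the low bits, i.e. a value 2^k - 1.
    static bool isMask32(uint32_t v) { return v != 0 && ((v + 1) & v) == 0; }

    int main() {
      uint32_t x = 0xABCD1234u, shift = 8, mask = 0xFF;
      assert(isMask32(mask));
      uint32_t width = 8;                 // CountTrailingOnes_32(0xFF)
      assert(width <= 32 - shift);        // the extracted field must fit
      uint32_t bfe = (x >> shift) & mask; // the pattern being matched
      assert(bfe == 0x12);
      return 0;
    }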
- if (!AndMask || !ShrVal) { - return false; - } - Constant *newMaskConst; - Constant *shiftValConst; - if (isVector) { - // Handle the vector case - std::vector<Constant *> maskVals; - std::vector<Constant *> shiftVals; - ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask); - ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal); - Type *scalarType = AndMaskVec->getType()->getScalarType(); - assert(AndMaskVec->getNumOperands() == - ShrValVec->getNumOperands() && "cannot have a " - "combination where the number of elements to a " - "shift and an and are different!"); - for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) { - ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x)); - ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x)); - if (!AndCI || !ShiftIC) { - return false; - } - uint32_t maskVal = (uint32_t)AndCI->getZExtValue(); - if (!isMask_32(maskVal)) { - return false; - } - maskVal = (uint32_t)CountTrailingOnes_32(maskVal); - uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue(); - // If the mask or shiftval is greater than the bitcount, then break out. - if (maskVal >= 32 || shiftVal >= 32) { - return false; - } - // If the mask val is greater than the the number of original bits left - // then this optimization is invalid. - if (maskVal > (32 - shiftVal)) { - return false; - } - maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned)); - shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned)); - } - newMaskConst = ConstantVector::get(maskVals); - shiftValConst = ConstantVector::get(shiftVals); - } else { - // Handle the scalar case - uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue(); - // This must be a mask value where all lower bits are set to 1 and then any - // bit higher is set to 0. - if (!isMask_32(maskVal)) { - return false; - } - maskVal = (uint32_t)CountTrailingOnes_32(maskVal); - // Count the number of bits set in the mask, this is the width of the - // resulting bit set that is extracted from the source value. - uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue(); - // If the mask or shift val is greater than the bitcount, then break out. - if (maskVal >= 32 || shiftVal >= 32) { - return false; - } - // If the mask val is greater than the the number of original bits left then - // this optimization is invalid. - if (maskVal > (32 - shiftVal)) { - return false; - } - newMaskConst = ConstantInt::get(aType, maskVal, isSigned); - shiftValConst = ConstantInt::get(aType, shiftVal, isSigned); - } - // Lets create the function signature. - std::vector<Type *> callTypes; - callTypes.push_back(aType); - callTypes.push_back(aType); - callTypes.push_back(aType); - FunctionType *funcType = FunctionType::get(aType, callTypes, false); - std::string name = "llvm.AMDGPU.bit.extract.u32"; - if (isVector) { - name += ".v" + itostr(numEle) + "i32"; - } else { - name += "."; - } - // Lets create the function. 
- Function *Func = - dyn_cast<Function>(inst->getParent()->getParent()->getParent()-> - getOrInsertFunction(StringRef(name), funcType)); - Value *Operands[3] = { - ShiftInst->getOperand(0), - shiftValConst, - newMaskConst - }; - // Lets create the Call with the operands - CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt"); - CI->setDoesNotAccessMemory(); - CI->insertBefore(inst); - inst->replaceAllUsesWith(CI); - return true; -} - -bool -AMDGPUPeepholeOpt::expandBFI(CallInst *CI) { - if (!CI) { - return false; - } - Value *LHS = CI->getOperand(CI->getNumOperands() - 1); - if (!LHS->getName().startswith("__amdil_bfi")) { - return false; - } - Type* type = CI->getOperand(0)->getType(); - Constant *negOneConst = NULL; - if (type->isVectorTy()) { - std::vector<Constant *> negOneVals; - negOneConst = ConstantInt::get(CI->getContext(), - APInt(32, StringRef("-1"), 10)); - for (size_t x = 0, - y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) { - negOneVals.push_back(negOneConst); - } - negOneConst = ConstantVector::get(negOneVals); - } else { - negOneConst = ConstantInt::get(CI->getContext(), - APInt(32, StringRef("-1"), 10)); - } - // __amdil_bfi => (A & B) | (~A & C) - BinaryOperator *lhs = - BinaryOperator::Create(Instruction::And, CI->getOperand(0), - CI->getOperand(1), "bfi_and", CI); - BinaryOperator *rhs = - BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst, - "bfi_not", CI); - rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2), - "bfi_and", CI); - lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI); - CI->replaceAllUsesWith(lhs); - return true; -} - -bool -AMDGPUPeepholeOpt::expandBFM(CallInst *CI) { - if (!CI) { - return false; - } - Value *LHS = CI->getOperand(CI->getNumOperands() - 1); - if (!LHS->getName().startswith("__amdil_bfm")) { - return false; - } - // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f) - Constant *newMaskConst = NULL; - Constant *newShiftConst = NULL; - Type* type = CI->getOperand(0)->getType(); - if (type->isVectorTy()) { - std::vector<Constant*> newMaskVals, newShiftVals; - newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F); - newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1); - for (size_t x = 0, - y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) { - newMaskVals.push_back(newMaskConst); - newShiftVals.push_back(newShiftConst); - } - newMaskConst = ConstantVector::get(newMaskVals); - newShiftConst = ConstantVector::get(newShiftVals); - } else { - newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F); - newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1); - } - BinaryOperator *lhs = - BinaryOperator::Create(Instruction::And, CI->getOperand(0), - newMaskConst, "bfm_mask", CI); - lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst, - lhs, "bfm_shl", CI); - lhs = BinaryOperator::Create(Instruction::Sub, lhs, - newShiftConst, "bfm_sub", CI); - BinaryOperator *rhs = - BinaryOperator::Create(Instruction::And, CI->getOperand(1), - newMaskConst, "bfm_mask", CI); - lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI); - CI->replaceAllUsesWith(lhs); - return true; -} - -bool -AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) { - Instruction *inst = (*bbb); - if (optimizeCallInst(bbb)) { - return true; - } - if (optimizeBitExtract(inst)) { - return false; - } - if (optimizeBitInsert(inst)) { - return false; - } - if (correctMisalignedMemOp(inst)) { - return false; - } - return 
false; -} -bool -AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) { - LoadInst *linst = dyn_cast<LoadInst>(inst); - StoreInst *sinst = dyn_cast<StoreInst>(inst); - unsigned alignment; - Type* Ty = inst->getType(); - if (linst) { - alignment = linst->getAlignment(); - Ty = inst->getType(); - } else if (sinst) { - alignment = sinst->getAlignment(); - Ty = sinst->getValueOperand()->getType(); - } else { - return false; - } - unsigned size = getTypeSize(Ty); - if (size == alignment || size < alignment) { - return false; - } - if (!Ty->isStructTy()) { - return false; - } - if (alignment < 4) { - if (linst) { - linst->setAlignment(0); - return true; - } else if (sinst) { - sinst->setAlignment(0); - return true; - } - } - return false; -} -bool -AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI) { - if (!CI) { - return false; - } - Value *LHS = CI->getOperand(CI->getNumOperands() - 1); - std::string namePrefix = LHS->getName().substr(0, 14); - if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24" - && namePrefix != "__amdil__imul24_high") { - return false; - } - if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) { - return false; - } - return true; -} - -void -AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI) { - assert(isSigned24BitOps(CI) && "Must be a " - "signed 24 bit operation to call this function!"); - Value *LHS = CI->getOperand(CI->getNumOperands()-1); - // On 7XX and 8XX we do not have signed 24bit, so we need to - // expand it to the following: - // imul24 turns into 32bit imul - // imad24 turns into 32bit imad - // imul24_high turns into 32bit imulhigh - if (LHS->getName().substr(0, 14) == "__amdil_imad24") { - Type *aType = CI->getOperand(0)->getType(); - bool isVector = aType->isVectorTy(); - int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1; - std::vector<Type*> callTypes; - callTypes.push_back(CI->getOperand(0)->getType()); - callTypes.push_back(CI->getOperand(1)->getType()); - callTypes.push_back(CI->getOperand(2)->getType()); - FunctionType *funcType = - FunctionType::get(CI->getOperand(0)->getType(), callTypes, false); - std::string name = "__amdil_imad"; - if (isVector) { - name += "_v" + itostr(numEle) + "i32"; - } else { - name += "_i32"; - } - Function *Func = dyn_cast<Function>( - CI->getParent()->getParent()->getParent()-> - getOrInsertFunction(StringRef(name), funcType)); - Value *Operands[3] = { - CI->getOperand(0), - CI->getOperand(1), - CI->getOperand(2) - }; - CallInst *nCI = CallInst::Create(Func, Operands, "imad24"); - nCI->insertBefore(CI); - CI->replaceAllUsesWith(nCI); - } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") { - BinaryOperator *mulOp = - BinaryOperator::Create(Instruction::Mul, CI->getOperand(0), - CI->getOperand(1), "imul24", CI); - CI->replaceAllUsesWith(mulOp); - } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") { - Type *aType = CI->getOperand(0)->getType(); - - bool isVector = aType->isVectorTy(); - int numEle = isVector ? 
dyn_cast<VectorType>(aType)->getNumElements() : 1; - std::vector<Type*> callTypes; - callTypes.push_back(CI->getOperand(0)->getType()); - callTypes.push_back(CI->getOperand(1)->getType()); - FunctionType *funcType = - FunctionType::get(CI->getOperand(0)->getType(), callTypes, false); - std::string name = "__amdil_imul_high"; - if (isVector) { - name += "_v" + itostr(numEle) + "i32"; - } else { - name += "_i32"; - } - Function *Func = dyn_cast<Function>( - CI->getParent()->getParent()->getParent()-> - getOrInsertFunction(StringRef(name), funcType)); - Value *Operands[2] = { - CI->getOperand(0), - CI->getOperand(1) - }; - CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high"); - nCI->insertBefore(CI); - CI->replaceAllUsesWith(nCI); - } -} - -bool -AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI) { - return (CI != NULL - && CI->getOperand(CI->getNumOperands() - 1)->getName() - == "__amdil_get_local_size_int"); -} - -bool -AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI) { - if (!CI) { - return false; - } - if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX - && (mSTM->getDeviceName() == "cayman")) { - return false; - } - return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20) - == "__amdil_improved_div"; -} - -void -AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI) { - assert(convertAccurateDivide(CI) - && "expanding accurate divide can only happen if it is expandable!"); - BinaryOperator *divOp = - BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0), - CI->getOperand(1), "fdiv32", CI); - CI->replaceAllUsesWith(divOp); -} - -bool -AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI) { - if (optLevel != CodeGenOpt::None) { - return false; - } - - if (!CI) { - return false; - } - - unsigned funcNameIdx = 0; - funcNameIdx = CI->getNumOperands() - 1; - StringRef calleeName = CI->getOperand(funcNameIdx)->getName(); - if (calleeName != "__amdil_image2d_read_norm" - && calleeName != "__amdil_image2d_read_unnorm" - && calleeName != "__amdil_image3d_read_norm" - && calleeName != "__amdil_image3d_read_unnorm") { - return false; - } - - unsigned samplerIdx = 2; - samplerIdx = 1; - Value *sampler = CI->getOperand(samplerIdx); - LoadInst *lInst = dyn_cast<LoadInst>(sampler); - if (!lInst) { - return false; - } - - if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { - return false; - } - - GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand()); - // If we are loading from what is not a global value, then we - // fail and return. - if (!gv) { - return false; - } - - // If there is no initializer, or the initializer is not a - // 32-bit integer, we fail. - if (!gv->hasInitializer() - || !gv->getInitializer()->getType()->isIntegerTy(32)) { - return false; - } - - // Now that we have the global variable initializer, let's replace - // all uses of the load instruction with the samplerVal and - // reparse the __amdil_is_constant() function. 
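The expandBFI and expandBFM rewrites above are pure bit arithmetic once the IR plumbing is stripped away. A minimal standalone C++ check of the two identities, bfi(a, b, c) = (a & b) | (~a & c) and bfm(w, s) = ((1 << (w & 0x1F)) - 1) << (s & 0x1F), is sketched below; the function names are illustrative, not part of the backend:

#include <cassert>
#include <cstdint>

// Bit-field insert: take bits from b where a is 1, from c where a is 0.
static uint32_t bfi(uint32_t a, uint32_t b, uint32_t c) {
  return (a & b) | (~a & c);
}

// Bit-field mask: (w & 0x1F) ones, shifted left by (s & 0x1F).
static uint32_t bfm(uint32_t w, uint32_t s) {
  return ((1u << (w & 0x1F)) - 1u) << (s & 0x1F);
}

int main() {
  assert(bfi(0x0000FFFFu, 0x12345678u, 0xABCDEF01u) == 0xABCD5678u);
  assert(bfm(4, 8) == 0x00000F00u); // four ones starting at bit 8
  return 0;
}
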
- Constant *samplerVal = gv->getInitializer(); - lInst->replaceAllUsesWith(samplerVal); - return true; -} - -bool -AMDGPUPeepholeOpt::doInitialization(Module &M) { - return false; -} - -bool -AMDGPUPeepholeOpt::doFinalization(Module &M) { - return false; -} - -void -AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired<MachineFunctionAnalysis>(); - FunctionPass::getAnalysisUsage(AU); - AU.setPreservesAll(); -} - -size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) { - size_t size = 0; - if (!T) { - return size; - } - switch (T->getTypeID()) { - case Type::X86_FP80TyID: - case Type::FP128TyID: - case Type::PPC_FP128TyID: - case Type::LabelTyID: - assert(0 && "These types are not supported by this backend"); - default: - case Type::FloatTyID: - case Type::DoubleTyID: - size = T->getPrimitiveSizeInBits() >> 3; - break; - case Type::PointerTyID: - size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr); - break; - case Type::IntegerTyID: - size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr); - break; - case Type::StructTyID: - size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr); - break; - case Type::ArrayTyID: - size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr); - break; - case Type::FunctionTyID: - size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr); - break; - case Type::VectorTyID: - size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr); - break; - }; - return size; -} - -size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST, - bool dereferencePtr) { - size_t size = 0; - if (!ST) { - return size; - } - Type *curType; - StructType::element_iterator eib; - StructType::element_iterator eie; - for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) { - curType = *eib; - size += getTypeSize(curType, dereferencePtr); - } - return size; -} - -size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT, - bool dereferencePtr) { - return IT ? (IT->getBitWidth() >> 3) : 0; -} - -size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT, - bool dereferencePtr) { - assert(0 && "Should not be able to calculate the size of a function type"); - return 0; -} - -size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT, - bool dereferencePtr) { - return (size_t)(AT ? (getTypeSize(AT->getElementType(), - dereferencePtr) * AT->getNumElements()) - : 0); -} - -size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT, - bool dereferencePtr) { - return VT ? 
(VT->getBitWidth() >> 3) : 0; -} - -size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT, - bool dereferencePtr) { - if (!PT) { - return 0; - } - Type *CT = PT->getElementType(); - if (CT->getTypeID() == Type::StructTyID && - PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) { - return getTypeSize(dyn_cast<StructType>(CT)); - } else if (dereferencePtr) { - size_t size = 0; - for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) { - size += getTypeSize(PT->getContainedType(x), dereferencePtr); - } - return size; - } else { - return 4; - } -} - -size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT, - bool dereferencePtr) { - //assert(0 && "Should not be able to calculate the size of an opaque type"); - return 4; -} diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt index 2ad2047278..97f0a40c29 100644 --- a/lib/Target/R600/CMakeLists.txt +++ b/lib/Target/R600/CMakeLists.txt @@ -21,7 +21,6 @@ add_llvm_target(R600CodeGen AMDILISelDAGToDAG.cpp AMDILISelLowering.cpp AMDILNIDevice.cpp - AMDILPeepholeOptimizer.cpp AMDILSIDevice.cpp AMDGPUAsmPrinter.cpp AMDGPUFrameLowering.cpp diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp index 45d009c2a0..61d70bb342 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -78,7 +78,7 @@ static MCCodeEmitter *createAMDGPUMCCodeEmitter(const MCInstrInfo &MCII, if (STI.getFeatureBits() & AMDGPU::Feature64BitPtr) { return createSIMCCodeEmitter(MCII, MRI, STI, Ctx); } else { - return createR600MCCodeEmitter(MCII, MRI, STI, Ctx); + return createR600MCCodeEmitter(MCII, MRI, STI); } } diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h index 09d0d5b61c..abb032045b 100644 --- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -33,8 +33,7 @@ extern Target TheAMDGPUTarget; MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, - MCContext &Ctx); + const MCSubtargetInfo &STI); MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp index 271a974734..cb4cf0ce38 100644 --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -36,13 +36,12 @@ class R600MCCodeEmitter : public AMDGPUMCCodeEmitter { const MCInstrInfo &MCII; const MCRegisterInfo &MRI; const MCSubtargetInfo &STI; - MCContext &Ctx; public: R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, - const MCSubtargetInfo &sti, MCContext &ctx) - : MCII(mcii), MRI(mri), STI(sti), Ctx(ctx) { } + const MCSubtargetInfo &sti) + : MCII(mcii), MRI(mri), STI(sti) { } /// \brief Encode the instruction and write it to the OS. 
virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS, @@ -99,9 +98,8 @@ enum TextureTypes { MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI, - const MCSubtargetInfo &STI, - MCContext &Ctx) { - return new R600MCCodeEmitter(MCII, MRI, STI, Ctx); + const MCSubtargetInfo &STI) { + return new R600MCCodeEmitter(MCII, MRI, STI); } void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, @@ -181,6 +179,13 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, Emit((u_int32_t) 0, OS); } else { uint64_t Inst = getBinaryCodeForInstr(MI, Fixups); + if ((STI.getFeatureBits() & AMDGPU::FeatureR600ALUInst) && + ((Desc.TSFlags & R600_InstFlag::OP1) || + Desc.TSFlags & R600_InstFlag::OP2)) { + uint64_t ISAOpCode = Inst & (0x3FFULL << 39); + Inst &= ~(0x3FFULL << 39); + Inst |= ISAOpCode << 1; + } Emit(Inst, OS); } } diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td index 5ee1c0d8ae..0cbe919d81 100644 --- a/lib/Target/R600/Processors.td +++ b/lib/Target/R600/Processors.td @@ -45,3 +45,4 @@ def : Proc<"tahiti", SI_Itin, [Feature64BitPtr, FeatureFP64]>; def : Proc<"pitcairn", SI_Itin, [Feature64BitPtr, FeatureFP64]>; def : Proc<"verde", SI_Itin, [Feature64BitPtr, FeatureFP64]>; def : Proc<"oland", SI_Itin, [Feature64BitPtr, FeatureFP64]>; +def : Proc<"hainan", SI_Itin, [Feature64BitPtr, FeatureFP64]>; diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp index cdda3dab8d..ffe3414413 100644 --- a/lib/Target/R600/R600ControlFlowFinalizer.cpp +++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp @@ -325,7 +325,7 @@ public: virtual bool runOnMachineFunction(MachineFunction &MF) { unsigned MaxStack = 0; unsigned CurrentStack = 0; - bool hasPush; + bool HasPush = false; for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME; ++MB) { MachineBasicBlock &MBB = *MB; @@ -337,6 +337,7 @@ public: BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()), getHWInstrDesc(CF_CALL_FS)); CfCount++; + MaxStack = 1; } std::vector<ClauseFile> FetchClauses, AluClauses; for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); @@ -354,7 +355,7 @@ public: case AMDGPU::CF_ALU_PUSH_BEFORE: CurrentStack++; MaxStack = std::max(MaxStack, CurrentStack); - hasPush = true; + HasPush = true; case AMDGPU::CF_ALU: I = MI; AluClauses.push_back(MakeALUClause(MBB, I)); @@ -475,7 +476,7 @@ public: break; } } - MFI->StackSize = getHWStackSize(MaxStack, hasPush); + MFI->StackSize = getHWStackSize(MaxStack, HasPush); } return false; diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index c6e2136ff4..7252235d5b 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -43,6 +43,8 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::AND, MVT::v4i32, Expand); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Expand); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Expand); + setOperationAction(ISD::MUL, MVT::v2i32, Expand); + setOperationAction(ISD::MUL, MVT::v4i32, Expand); setOperationAction(ISD::OR, MVT::v4i32, Expand); setOperationAction(ISD::OR, MVT::v2i32, Expand); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Expand); @@ -50,6 +52,10 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::SHL, MVT::v2i32, Expand); setOperationAction(ISD::SRL, MVT::v4i32, Expand); setOperationAction(ISD::SRL, MVT::v2i32, 
Expand); + setOperationAction(ISD::SRA, MVT::v4i32, Expand); + setOperationAction(ISD::SRA, MVT::v2i32, Expand); + setOperationAction(ISD::SUB, MVT::v4i32, Expand); + setOperationAction(ISD::SUB, MVT::v2i32, Expand); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Expand); setOperationAction(ISD::UDIV, MVT::v4i32, Expand); setOperationAction(ISD::UREM, MVT::v4i32, Expand); @@ -78,6 +84,9 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) : setOperationAction(ISD::SELECT, MVT::i32, Custom); setOperationAction(ISD::SELECT, MVT::f32, Custom); + setOperationAction(ISD::VSELECT, MVT::v4i32, Expand); + setOperationAction(ISD::VSELECT, MVT::v2i32, Expand); + // Legalize loads and stores to the private address space. setOperationAction(ISD::LOAD, MVT::i32, Custom); setOperationAction(ISD::LOAD, MVT::v2i32, Custom); diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 8a60add450..8f47523524 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -1615,6 +1615,7 @@ let Predicates = [isEGorCayman] in { i32:$src2))], VecALU >; + def : BFEPattern <BFE_UINT_eg>; def BFI_INT_eg : R600_3OP <0x06, "BFI_INT", [], VecALU>; defm : BFIPatterns <BFI_INT_eg>; diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 16e1e42c50..42b4e73509 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -369,7 +369,14 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF, /// getCompactUnwindRegNum - Get the compact unwind number for a given /// register. The number corresponds to the enum lists in /// compact_unwind_encoding.h. -static int getCompactUnwindRegNum(const uint16_t *CURegs, unsigned Reg) { +static int getCompactUnwindRegNum(unsigned Reg, bool is64Bit) { + static const uint16_t CU32BitRegs[] = { + X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 + }; + static const uint16_t CU64BitRegs[] = { + X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 + }; + const uint16_t *CURegs = is64Bit ? CU64BitRegs : CU32BitRegs; for (int Idx = 1; *CURegs; ++CURegs, ++Idx) if (*CURegs == Reg) return Idx; @@ -398,16 +405,8 @@ encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS], // 4 3 // 5 3 // - static const uint16_t CU32BitRegs[] = { - X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 - }; - static const uint16_t CU64BitRegs[] = { - X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 - }; - const uint16_t *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs); - for (unsigned i = 0; i != CU_NUM_SAVED_REGS; ++i) { - int CUReg = getCompactUnwindRegNum(CURegs, SavedRegs[i]); + int CUReg = getCompactUnwindRegNum(SavedRegs[i], Is64Bit); if (CUReg == -1) return ~0U; SavedRegs[i] = CUReg; } @@ -466,14 +465,6 @@ encodeCompactUnwindRegistersWithoutFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS], static uint32_t encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS], bool Is64Bit) { - static const uint16_t CU32BitRegs[] = { - X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0 - }; - static const uint16_t CU64BitRegs[] = { - X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0 - }; - const uint16_t *CURegs = (Is64Bit ? CU64BitRegs : CU32BitRegs); - // Encode the registers in the order they were saved, 3-bits per register. The // registers are numbered from 1 to CU_NUM_SAVED_REGS. 
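The helper being refactored in this hunk feeds an encoding that packs up to six saved registers, three bits apiece, into one 32-bit word, with 0 marking an empty slot. A rough standalone C++ sketch of that packing, where CU_NUM_SAVED_REGS and the 1-based register numbers are simplified stand-ins for the real tables, and the bit order is reduced to the frame-based case:

#include <cassert>
#include <cstdint>

static const unsigned CU_NUM_SAVED_REGS = 6;

// Pack 1-based compact-unwind register numbers (0 = empty slot) in save
// order, three bits per register.
static uint32_t encodeSavedRegs(const unsigned Regs[CU_NUM_SAVED_REGS]) {
  uint32_t RegEnc = 0;
  unsigned Idx = 0;
  for (unsigned I = 0; I != CU_NUM_SAVED_REGS; ++I) {
    if (Regs[I] == 0)
      continue; // skip unused slots
    RegEnc |= (Regs[I] & 0x7) << (Idx++ * 3);
  }
  return RegEnc;
}

int main() {
  // Say EBX (1), then EDI (4), then EBP (6) were pushed.
  const unsigned Saved[CU_NUM_SAVED_REGS] = {1, 4, 6, 0, 0, 0};
  assert(encodeSavedRegs(Saved) == ((6u << 6) | (4u << 3) | 1u));
  return 0;
}
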
uint32_t RegEnc = 0; @@ -481,7 +472,7 @@ encodeCompactUnwindRegistersWithFrame(unsigned SavedRegs[CU_NUM_SAVED_REGS], unsigned Reg = SavedRegs[I]; if (Reg == 0) continue; - int CURegNum = getCompactUnwindRegNum(CURegs, Reg); + int CURegNum = getCompactUnwindRegNum(Reg, Is64Bit); if (CURegNum == -1) return ~0U; // Encode the 3-bit register number in order, skipping over 3-bits for each @@ -534,6 +525,12 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const { // If there are too many saved registers, we cannot use compact encoding. if (SavedRegIdx >= CU_NUM_SAVED_REGS) return CU::UNWIND_MODE_DWARF; + unsigned Reg = MI.getOperand(0).getReg(); + if (Reg == (Is64Bit ? X86::RAX : X86::EAX)) { + ExpectEnd = true; + continue; + } + SavedRegs[SavedRegIdx++] = MI.getOperand(0).getReg(); StackAdjust += OffsetSize; InstrOffset += PushInstrSize; diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 45fe69aa29..f69f5d85f7 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -9336,29 +9336,31 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, if (Swap) std::swap(Op0, Op1); - // Since SSE has no unsigned integer comparisons, we need to flip the sign - // bits of the inputs before performing those operations. - if (FlipSigns) { - EVT EltVT = VT.getVectorElementType(); - SDValue SignBit = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), - EltVT); - std::vector<SDValue> SignBits(VT.getVectorNumElements(), SignBit); - SDValue SignVec = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &SignBits[0], - SignBits.size()); - Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SignVec); - Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SignVec); - } - // Check that the operation in question is available (most are plain SSE2, // but PCMPGTQ and PCMPEQQ have different requirements). if (VT == MVT::v2i64) { if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42()) { assert(Subtarget->hasSSE2() && "Don't know how to lower!"); - // First cast everything to the right type, + // First cast everything to the right type. Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0); Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1); + // Since SSE has no unsigned integer comparisons, we need to flip the sign + // bits of the inputs before performing those operations. The lower + // compare is always unsigned. + SDValue SB; + if (FlipSigns) { + SB = DAG.getConstant(0x80000000U, MVT::v4i32); + } else { + SDValue Sign = DAG.getConstant(0x80000000U, MVT::i32); + SDValue Zero = DAG.getConstant(0x00000000U, MVT::i32); + SB = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, + Sign, Zero, Sign, Zero); + } + Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB); + Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB); + // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)) SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1); @@ -9384,7 +9386,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget, // pcmpeqd + pshufd + pand. assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!"); - // First cast everything to the right type, + // First cast everything to the right type. 
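Two identities carry the rewritten v2i64 path above: XORing the sign bit turns an unsigned compare into a signed one, and a signed 64-bit greater-than can be assembled from 32-bit halves as (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)), with the low halves compared unsigned. A scalar C++ sanity check of both, no SSE involved and purely illustrative:

#include <cassert>
#include <cstdint>

// Unsigned a > b expressed as a signed compare after flipping sign bits.
static bool ugt_via_sgt(uint32_t a, uint32_t b) {
  int32_t sa = (int32_t)(a ^ 0x80000000u);
  int32_t sb = (int32_t)(b ^ 0x80000000u);
  return sa > sb;
}

// Signed 64-bit a > b rebuilt from its 32-bit halves, mirroring the
// PCMPGTQ emulation: signed compare on the high halves, unsigned on the
// low halves (which is why the lower compare always flips signs).
static bool sgt64_from_halves(int64_t a, int64_t b) {
  int32_t hi1 = (int32_t)(a >> 32), hi2 = (int32_t)(b >> 32);
  uint32_t lo1 = (uint32_t)a, lo2 = (uint32_t)b;
  return (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2));
}

int main() {
  assert(ugt_via_sgt(0xFFFFFFFFu, 1u));
  assert(!ugt_via_sgt(1u, 0xFFFFFFFFu));
  assert(sgt64_from_halves(-1, -2));
  assert(!sgt64_from_halves(1LL << 33, 1LL << 34));
  return 0;
}
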
Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0); Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1); @@ -9403,6 +9405,15 @@ } } + // Since SSE has no unsigned integer comparisons, we need to flip the sign + // bits of the inputs before performing those operations. + if (FlipSigns) { + EVT EltVT = VT.getVectorElementType(); + SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), VT); + Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB); + Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB); + } + SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); // If the logical-not of the result is required, perform that now. diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index ad4a6c7c41..08d372512d 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -80,6 +80,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/PatternMatch.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/ValueHandle.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -382,7 +383,7 @@ public: // The starting value of the reduction. // It does not have to be zero! - Value *StartValue; + TrackingVH<Value> StartValue; // The instruction whose value is used outside the loop. Instruction *LoopExitInstr; // The kind of the reduction. @@ -427,7 +428,7 @@ public: /// This flag indicates if we need to add the runtime check. bool Need; /// Holds the pointers that we need to check. - SmallVector<Value*, 2> Pointers; + SmallVector<TrackingVH<Value>, 2> Pointers; /// Holds the pointer value at the beginning of the loop. SmallVector<const SCEV*, 2> Starts; /// Holds the pointer value at the end of the loop. @@ -441,7 +442,7 @@ public: InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {} InductionInfo() : StartValue(0), IK(IK_NoInduction) {} /// Start value. - Value *StartValue; + TrackingVH<Value> StartValue; /// Induction kind. InductionKind IK; }; @@ -2307,7 +2308,10 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { } bool LoopVectorizationLegality::canVectorize() { - assert(TheLoop->getLoopPreheader() && "No preheader!!"); + // We must have a loop in canonical form. Loops with indirectbr in them cannot + // be canonicalized. + if (!TheLoop->getLoopPreheader()) + return false; // We can only vectorize innermost loops. if (TheLoop->getSubLoopsVector().size()) @@ -2374,6 +2378,26 @@ bool LoopVectorizationLegality::canVectorize() { return true; } +/// \brief Check that the instruction has outside loop users and is not an +/// identified reduction variable. +static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, + SmallPtrSet<Value *, 4> &Reductions) { + // Reduction instructions are allowed to have exit users. All other + // instructions must not have external users. + if (!Reductions.count(Inst)) + // Check that all of the users of the instruction are inside the loop. + for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end(); + I != E; ++I) { + Instruction *U = cast<Instruction>(*I); + // This user may be a reduction exit value. 
+ if (!TheLoop->contains(U)) { + DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); + return true; + } + } + return false; +} + bool LoopVectorizationLegality::canVectorizeInstrs() { BasicBlock *PreHeader = TheLoop->getLoopPreheader(); BasicBlock *Header = TheLoop->getHeader(); @@ -2412,8 +2436,13 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // If this PHINode is not in the header block, then we know that we // can convert it to select during if-conversion. No need to check if // the PHIs in this block are induction or reduction variables. - if (*bb != Header) - continue; + if (*bb != Header) { + // Check that this instruction has no outside users or is an + // identified reduction value with an outside user. + if (!hasOutsideLoopUser(TheLoop, it, AllowedExit)) + continue; + return false; + } // We only allow if-converted PHIs with exactly two incoming values. if (Phi->getNumIncomingValues() != 2) { @@ -2506,17 +2535,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Reduction instructions are allowed to have exit users. // All other instructions must not have external users. - if (!AllowedExit.count(it)) - //Check that all of the users of the loop are inside the BB. - for (Value::use_iterator I = it->use_begin(), E = it->use_end(); - I != E; ++I) { - Instruction *U = cast<Instruction>(*I); - // This user may be a reduction exit value. - if (!TheLoop->contains(U)) { - DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); - return false; - } - } + if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) + return false; + } // next instr. } diff --git a/test/CodeGen/ARM/vmul.ll b/test/CodeGen/ARM/vmul.ll index 74628f0c5c..eb5ad8f0c3 100644 --- a/test/CodeGen/ARM/vmul.ll +++ b/test/CodeGen/ARM/vmul.ll @@ -599,3 +599,27 @@ for.end179: ; preds = %for.cond.loopexit, declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone declare <8 x i16> @llvm.arm.neon.vqsubu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone declare <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16>) nounwind readnone + +; vmull lowering would create a zext(v4i8 load()) instead of a zextload(v4i8), +; creating an illegal type during legalization and causing an assert. 
+; PR15970 +define void @no_illegal_types_vmull_sext(<4 x i32> %a) { +entry: + %wide.load283.i = load <4 x i8>* undef, align 1 + %0 = sext <4 x i8> %wide.load283.i to <4 x i32> + %1 = sub nsw <4 x i32> %0, %a + %2 = mul nsw <4 x i32> %1, %1 + %predphi290.v.i = select <4 x i1> undef, <4 x i32> undef, <4 x i32> %2 + store <4 x i32> %predphi290.v.i, <4 x i32>* undef, align 4 + ret void +} +define void @no_illegal_types_vmull_zext(<4 x i32> %a) { +entry: + %wide.load283.i = load <4 x i8>* undef, align 1 + %0 = zext <4 x i8> %wide.load283.i to <4 x i32> + %1 = sub nsw <4 x i32> %0, %a + %2 = mul nsw <4 x i32> %1, %1 + %predphi290.v.i = select <4 x i1> undef, <4 x i32> undef, <4 x i32> %2 + store <4 x i32> %predphi290.v.i, <4 x i32>* undef, align 4 + ret void +} diff --git a/test/CodeGen/Generic/annotate.ll b/test/CodeGen/Generic/annotate.ll new file mode 100644 index 0000000000..c617eb0925 --- /dev/null +++ b/test/CodeGen/Generic/annotate.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s + +; PR15253 + +@.str = private unnamed_addr constant [4 x i8] c"sth\00", section "llvm.metadata" +@.str1 = private unnamed_addr constant [4 x i8] c"t.c\00", section "llvm.metadata" + + +define i32 @foo(i32 %a) { +entry: + %0 = call i32 @llvm.annotation.i32(i32 %a, i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([4 x i8]* @.str1, i32 0, i32 0), i32 2) + ret i32 %0 +} + +declare i32 @llvm.annotation.i32(i32, i8*, i8*, i32) #1 diff --git a/test/CodeGen/Generic/ptr-annotate.ll b/test/CodeGen/Generic/ptr-annotate.ll new file mode 100644 index 0000000000..ac5bd5533e --- /dev/null +++ b/test/CodeGen/Generic/ptr-annotate.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s + +; PR15253 + +%struct.mystruct = type { i32 } + +@.str = private unnamed_addr constant [4 x i8] c"sth\00", section "llvm.metadata" +@.str1 = private unnamed_addr constant [4 x i8] c"t.c\00", section "llvm.metadata" + +define void @foo() { +entry: + %m = alloca i8, align 4 + %0 = call i8* @llvm.ptr.annotation.p0i8(i8* %m, i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([4 x i8]* @.str1, i32 0, i32 0), i32 2) + store i8 1, i8* %0, align 4 + ret void +} + +declare i8* @llvm.ptr.annotation.p0i8(i8*, i8*, i8*, i32) #1 diff --git a/test/CodeGen/NVPTX/generic-to-nvvm.ll b/test/CodeGen/NVPTX/generic-to-nvvm.ll new file mode 100644 index 0000000000..c9cb2f71f4 --- /dev/null +++ b/test/CodeGen/NVPTX/generic-to-nvvm.ll @@ -0,0 +1,25 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -drvcuda | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + +; Ensure global variables in address space 0 are promoted to address space 1 + +; CHECK: .global .align 4 .u32 myglobal = 42; +@myglobal = internal global i32 42, align 4 +; CHECK: .global .align 4 .u32 myconst = 42; +@myconst = internal constant i32 42, align 4 + + +define void @foo(i32* %a, i32* %b) { +; CHECK: cvta.global.u32 + %ld1 = load i32* @myglobal +; CHECK: cvta.global.u32 + %ld2 = load i32* @myconst + store i32 %ld1, i32* %a + store i32 %ld2, i32* %b + ret void +} + + +!nvvm.annotations = !{!0} +!0 = metadata !{void (i32*, i32*)* @foo, metadata !"kernel", i32 1} diff --git a/test/CodeGen/NVPTX/i1-global.ll b/test/CodeGen/NVPTX/i1-global.ll new file mode 100644 index 0000000000..0595325977 --- /dev/null +++ b/test/CodeGen/NVPTX/i1-global.ll @@ -0,0 +1,19 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -drvcuda | FileCheck %s + +target datalayout = 
"e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + + +; CHECK: .visible .global .align 1 .u8 mypred +@mypred = addrspace(1) global i1 true, align 1 + + +define void @foo(i1 %p, i32* %out) { + %ld = load i1 addrspace(1)* @mypred + %val = zext i1 %ld to i32 + store i32 %val, i32* %out + ret void +} + + +!nvvm.annotations = !{!0} +!0 = metadata !{void (i1, i32*)* @foo, metadata !"kernel", i32 1} diff --git a/test/CodeGen/NVPTX/i1-param.ll b/test/CodeGen/NVPTX/i1-param.ll new file mode 100644 index 0000000000..fabd61a25d --- /dev/null +++ b/test/CodeGen/NVPTX/i1-param.ll @@ -0,0 +1,18 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -drvcuda | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + +; Make sure predicate (i1) operands to kernels get expanded out to .u8 + +; CHECK: .entry foo +; CHECK: .param .u8 foo_param_0 +; CHECK: .param .u32 foo_param_1 +define void @foo(i1 %p, i32* %out) { + %val = zext i1 %p to i32 + store i32 %val, i32* %out + ret void +} + + +!nvvm.annotations = !{!0} +!0 = metadata !{void (i1, i32*)* @foo, metadata !"kernel", i32 1} diff --git a/test/CodeGen/NVPTX/intrinsics.ll b/test/CodeGen/NVPTX/intrinsics.ll index 8b0357be87..1676f20643 100644 --- a/test/CodeGen/NVPTX/intrinsics.ll +++ b/test/CodeGen/NVPTX/intrinsics.ll @@ -15,5 +15,12 @@ define ptx_device double @test_fabs(double %d) { ret double %x } +define float @test_nvvm_sqrt(float %a) { + %val = call float @llvm.nvvm.sqrt.f(float %a) + ret float %val +} + + declare float @llvm.fabs.f32(float) declare double @llvm.fabs.f64(double) +declare float @llvm.nvvm.sqrt.f(float) diff --git a/test/CodeGen/NVPTX/refl1.ll b/test/CodeGen/NVPTX/refl1.ll new file mode 100644 index 0000000000..5a9dac152e --- /dev/null +++ b/test/CodeGen/NVPTX/refl1.ll @@ -0,0 +1,37 @@ +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -drvcuda | FileCheck %s + +; Function Attrs: nounwind +; CHECK: .entry foo +define void @foo(float* nocapture %a) #0 { + %val = load float* %a + %tan = tail call fastcc float @__nv_fast_tanf(float %val) + store float %tan, float* %a + ret void +} + +; Function Attrs: nounwind readnone +declare float @llvm.nvvm.sin.approx.ftz.f(float) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.nvvm.cos.approx.ftz.f(float) #1 + +; Function Attrs: nounwind readnone +declare float @llvm.nvvm.div.approx.ftz.f(float, float) #1 + +; Function Attrs: alwaysinline inlinehint nounwind readnone +; CHECK: .func (.param .b32 func_retval0) __nv_fast_tanf +define internal fastcc float @__nv_fast_tanf(float %a) #2 { +entry: + %0 = tail call float @llvm.nvvm.sin.approx.ftz.f(float %a) + %1 = tail call float @llvm.nvvm.cos.approx.ftz.f(float %a) + %2 = tail call float @llvm.nvvm.div.approx.ftz.f(float %0, float %1) + ret float %2 +} + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { alwaysinline inlinehint nounwind readnone } + +!nvvm.annotations = !{!0} + +!0 = metadata !{void (float*)* @foo, metadata !"kernel", i32 1} diff --git a/test/CodeGen/R600/bfe_uint.ll b/test/CodeGen/R600/bfe_uint.ll new file mode 100644 index 0000000000..92570c3152 --- /dev/null +++ b/test/CodeGen/R600/bfe_uint.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: @bfe_def +; CHECK: BFE_UINT +define void @bfe_def(i32 addrspace(1)* %out, i32 %x) { +entry: + %0 = lshr i32 %x, 5 + %1 = and i32 %0, 
15 ; 0xf + store i32 %1, i32 addrspace(1)* %out + ret void +} + +; This program could be implemented using a BFE_UINT instruction, however +; since the lshr constant + number of bits in the mask is >= 32, it can also be +; implemented with a LSHR instruction, which is better, because LSHR has fewer +; operands and requires fewer constants. + +; CHECK: @bfe_shift +; CHECK-NOT: BFE_UINT +define void @bfe_shift(i32 addrspace(1)* %out, i32 %x) { +entry: + %0 = lshr i32 %x, 16 + %1 = and i32 %0, 65535 ; 0xffff + store i32 %1, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/mul.ll b/test/CodeGen/R600/mul.ll new file mode 100644 index 0000000000..7278e90398 --- /dev/null +++ b/test/CodeGen/R600/mul.ll @@ -0,0 +1,16 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; mul24 and mad24 are affected +;CHECK: MULLO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: MULLO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: MULLO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: MULLO_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32> addrspace(1) * %in + %b = load <4 x i32> addrspace(1) * %b_ptr + %result = mul <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/r600-encoding.ll b/test/CodeGen/R600/r600-encoding.ll new file mode 100644 index 0000000000..c8040a1b4c --- /dev/null +++ b/test/CodeGen/R600/r600-encoding.ll @@ -0,0 +1,24 @@ +; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=redwood | FileCheck --check-prefix=EG-CHECK %s +; RUN: llc < %s -march=r600 -show-mc-encoding -mcpu=rs880 | FileCheck --check-prefix=R600-CHECK %s + +; The earliest R600 GPUs have a slightly different encoding than the rest of +; the VLIW4/5 GPUs. 
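The pattern pinned down by @bfe_def and @bfe_shift above is (x >> shift) & mask with mask a run of low ones: a bit-field extract of width bits at position shift. The BFE machinery only pays off while shift + width < 32; at 32 or more, the shift alone already clears the high bits. A small C++ illustration, where bfe_u32 is a reference model rather than the hardware instruction:

#include <cassert>
#include <cstdint>

// Reference model of an unsigned bit-field extract: width bits at shift.
static uint32_t bfe_u32(uint32_t x, uint32_t shift, uint32_t width) {
  uint32_t mask = width < 32 ? (1u << width) - 1u : 0xFFFFFFFFu;
  return (x >> shift) & mask;
}

int main() {
  uint32_t x = 0xDEADBEEFu;
  // @bfe_def: (x >> 5) & 0xf is a genuine 4-bit extract at bit 5.
  assert(((x >> 5) & 0xfu) == bfe_u32(x, 5, 4));
  // @bfe_shift: shift + width == 32, the AND is redundant, LSHR suffices.
  assert(((x >> 16) & 0xffffu) == (x >> 16));
  return 0;
}
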
+ +; EG-CHECK: @test +; EG-CHECK: MUL_IEEE {{[ *TXYZW.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x01,0x[0-9a-f]+,0x[0-9a-f]+}}] + +; R600-CHECK: @test +; R600-CHECK: MUL_IEEE {{[ *TXYZW.,0-9]+}} ; encoding: [{{0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x[0-9a-f]+,0x10,0x02,0x[0-9a-f]+,0x[0-9a-f]+}}] + +define void @test() { +entry: + %0 = call float @llvm.R600.load.input(i32 0) + %1 = call float @llvm.R600.load.input(i32 1) + %2 = fmul float %0, %1 + call void @llvm.AMDGPU.store.output(float %2, i32 0) + ret void +} + +declare float @llvm.R600.load.input(i32) readnone + +declare void @llvm.AMDGPU.store.output(float, i32) diff --git a/test/CodeGen/R600/sra.ll b/test/CodeGen/R600/sra.ll new file mode 100644 index 0000000000..972542d346 --- /dev/null +++ b/test/CodeGen/R600/sra.ll @@ -0,0 +1,13 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: @ashr_v4i32 +; CHECK: ASHR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: ASHR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: ASHR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: ASHR * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) { + %result = ashr <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/sub.ll b/test/CodeGen/R600/sub.ll new file mode 100644 index 0000000000..12bfba3975 --- /dev/null +++ b/test/CodeGen/R600/sub.ll @@ -0,0 +1,15 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: SUB_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: SUB_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32> addrspace(1) * %in + %b = load <4 x i32> addrspace(1) * %b_ptr + %result = sub <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/vselect.ll b/test/CodeGen/R600/vselect.ll new file mode 100644 index 0000000000..6e459df847 --- /dev/null +++ b/test/CodeGen/R600/vselect.ll @@ -0,0 +1,17 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: @test_select_v4i32 +; CHECK: CNDE_INT T{{[0-9]+\.[XYZW], PV\.[xyzw], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: CNDE_INT * T{{[0-9]+\.[XYZW], PV\.[xyzw], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: CNDE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +; CHECK: CNDE_INT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) { +entry: + %0 = load <4 x i32> addrspace(1)* %in0 + %1 = load <4 x i32> addrspace(1)* %in1 + %cmp = icmp ne <4 x i32> %0, %1 + %result = select <4 x i1> %cmp, <4 x i32> %0, <4 x i32> %1 + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/X86/compact-unwind.ll b/test/CodeGen/X86/compact-unwind.ll new file mode 100644 index 0000000000..8c4fa27da5 --- /dev/null +++ b/test/CodeGen/X86/compact-unwind.ll @@ -0,0 +1,30 @@ +; RUN: llc < %s -disable-cfi -disable-fp-elim -mtriple x86_64-apple-darwin11 | 
FileCheck %s + +%ty = type { i8* } + +@gv = external global i32 + +; This is aligning the stack with a push of a random register. +; CHECK: pushq %rax + +; Even though we can't encode %rax into the compact unwind, we still want to be +; able to generate a compact unwind encoding in this particular case. +; +; CHECK: __LD,__compact_unwind +; CHECK: _foo ## Range Start +; CHECK: 16842753 ## Compact Unwind Encoding: 0x1010001 + +define i8* @foo(i64 %size) { + %addr = alloca i64, align 8 + %tmp20 = load i32* @gv, align 4 + %tmp21 = call i32 @bar() + %tmp25 = load i64* %addr, align 8 + %tmp26 = inttoptr i64 %tmp25 to %ty* + %tmp29 = getelementptr inbounds %ty* %tmp26, i64 0, i32 0 + %tmp34 = load i8** %tmp29, align 8 + %tmp35 = getelementptr inbounds i8* %tmp34, i64 %size + store i8* %tmp35, i8** %tmp29, align 8 + ret i8* null +} + +declare i32 @bar() diff --git a/test/CodeGen/X86/vec_compare.ll b/test/CodeGen/X86/vec_compare.ll index 85d8b2cea3..fd5c234bb1 100644 --- a/test/CodeGen/X86/vec_compare.ll +++ b/test/CodeGen/X86/vec_compare.ll @@ -67,7 +67,15 @@ define <2 x i64> @test6(<2 x i64> %A, <2 x i64> %B) nounwind { } define <2 x i64> @test7(<2 x i64> %A, <2 x i64> %B) nounwind { +; CHECK: [[CONSTSEG:[A-Z0-9_]*]]: +; CHECK: .long 2147483648 +; CHECK-NEXT: .long 0 +; CHECK-NEXT: .long 2147483648 +; CHECK-NEXT: .long 0 ; CHECK: test7: +; CHECK: movdqa [[CONSTSEG]], [[CONSTREG:%xmm[0-9]*]] +; CHECK: pxor [[CONSTREG]] +; CHECK: pxor [[CONSTREG]] ; CHECK: pcmpgtd %xmm1 ; CHECK: pshufd $-96 ; CHECK: pcmpeqd @@ -83,6 +91,8 @@ define <2 x i64> @test8(<2 x i64> %A, <2 x i64> %B) nounwind { ; CHECK: test8: +; CHECK: pxor +; CHECK: pxor ; CHECK: pcmpgtd %xmm0 ; CHECK: pshufd $-96 ; CHECK: pcmpeqd @@ -98,6 +108,8 @@ define <2 x i64> @test9(<2 x i64> %A, <2 x i64> %B) nounwind { ; CHECK: test9: +; CHECK: pxor +; CHECK: pxor ; CHECK: pcmpgtd %xmm0 ; CHECK: pshufd $-96 ; CHECK: pcmpeqd @@ -115,6 +127,8 @@ define <2 x i64> @test10(<2 x i64> %A, <2 x i64> %B) nounwind { ; CHECK: test10: +; CHECK: pxor +; CHECK: pxor ; CHECK: pcmpgtd %xmm1 ; CHECK: pshufd $-96 ; CHECK: pcmpeqd @@ -131,9 +145,15 @@ define <2 x i64> @test10(<2 x i64> %A, <2 x i64> %B) nounwind { } define <2 x i64> @test11(<2 x i64> %A, <2 x i64> %B) nounwind { +; CHECK: [[CONSTSEG:[A-Z0-9_]*]]: +; CHECK: .long 2147483648 +; CHECK-NEXT: .long 2147483648 +; CHECK-NEXT: .long 2147483648 +; CHECK-NEXT: .long 2147483648 ; CHECK: test11: -; CHECK: pxor -; CHECK: pxor +; CHECK: movdqa [[CONSTSEG]], [[CONSTREG:%xmm[0-9]*]] +; CHECK: pxor [[CONSTREG]] +; CHECK: pxor [[CONSTREG]] ; CHECK: pcmpgtd %xmm1 ; CHECK: pshufd $-96 ; CHECK: pcmpeqd diff --git a/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll b/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll index e7af892c10..620478a879 100644 --- a/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll +++ b/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll @@ -1,5 +1,6 @@ ; RUN: llc -O0 %s -mtriple=x86_64-apple-darwin -filetype=obj -o %t ; RUN: llvm-dwarfdump %t | FileCheck %s +; RUN: llc < %s -O0 -mtriple=x86_64-apple-macosx10.7 | FileCheck %s -check-prefix=ASM ; rdar://13067005 ; CHECK: .debug_info contents: @@ -20,6 +21,11 @@ ; CHECK: file_names[ 1] 0 0x00000000 0x00000000 simple2.c ; CHECK-NOT: file_names +; PR15408 +; ASM: L__DWARF__debug_info_begin0: +; ASM: .long 0 ## DW_AT_stmt_list +; ASM: 
L__DWARF__debug_info_begin1: +; ASM: .long 0 ## DW_AT_stmt_list define i32 @test(i32 %a) nounwind uwtable ssp { entry: %a.addr = alloca i32, align 4 diff --git a/test/ExecutionEngine/MCJIT/eh.ll b/test/ExecutionEngine/MCJIT/eh.ll index 0c19b1bf2e..c2135736ad 100644 --- a/test/ExecutionEngine/MCJIT/eh.ll +++ b/test/ExecutionEngine/MCJIT/eh.ll @@ -1,5 +1,5 @@ ; RUN: %lli_mcjit %s -; XFAIL: arm, cygwin +; XFAIL: arm, cygwin, win32, mingw declare i8* @__cxa_allocate_exception(i64) declare void @__cxa_throw(i8*, i8*, i8*) declare i32 @__gxx_personality_v0(...) diff --git a/test/MC/AsmParser/exprs.s b/test/MC/AsmParser/exprs.s index df075f85ec..a7e10020b6 100644 --- a/test/MC/AsmParser/exprs.s +++ b/test/MC/AsmParser/exprs.s @@ -45,6 +45,7 @@ k: check_expr 0 || 0, 0 check_expr 1 + 2 < 3 + 4, 1 check_expr 1 << 8 - 1, 128 + check_expr 3 * 9 - 2 * 9 + 1, 10 .set c, 10 check_expr c + 1, 11 diff --git a/test/Transforms/LoopUnroll/scevunroll.ll b/test/Transforms/LoopUnroll/scevunroll.ll index 99b3a7d861..308a036316 100644 --- a/test/Transforms/LoopUnroll/scevunroll.ll +++ b/test/Transforms/LoopUnroll/scevunroll.ll @@ -66,13 +66,16 @@ exit2: ; SCEV properly unrolls multi-exit loops. ; +; SCEV cannot currently unroll this loop. +; It should ideally detect a trip count of 5. +; rdar:14038809 [SCEV]: Optimize trip count computation for multi-exit loops. ; CHECK: @multiExit -; CHECK: getelementptr i32* %base, i32 10 -; CHECK-NEXT: load i32* -; CHECK: br i1 false, label %l2.10, label %exit1 -; CHECK: l2.10: -; CHECK-NOT: br -; CHECK: ret i32 +; CHECKFIXME: getelementptr i32* %base, i32 10 +; CHECKFIXME-NEXT: load i32* +; CHECKFIXME: br i1 false, label %l2.10, label %exit1 +; CHECKFIXME: l2.10: +; CHECKFIXME-NOT: br +; CHECKFIXME: ret i32 define i32 @multiExit(i32* %base) nounwind { entry: br label %l1 @@ -170,3 +173,38 @@ for.body87: br label %for.body87 } +; PR16130: clang produces incorrect code with loop/expression at -O2 +; rdar:14036816 loop-unroll makes assumptions about undefined behavior +; +; The loop latch is assumed to exit after the first iteration because +; of the induction variable's NSW flag. However, the loop latch's +; equality test is skipped and the loop exits after the second +; iteration via the early exit. So loop unrolling cannot assume that +; the loop latch's exit count of zero is an upper bound on the number +; of iterations. 
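A C rendering of the @nsw_latch kernel below makes the trap easier to see; this is a hand back-translation of the IR, so treat it as illustrative only. b starts at 0, the early exit fires on the second trip once b has reached 8, and the latch's equality test against 13 is never taken, so the store must see 8:

#include <cstdio>

// Hand back-translation of @nsw_latch (illustrative, not the reported C).
static void nsw_latch(int *a) {
  int b = 0;
  for (;;) {
    if (b != 0)  // early exit: taken on the second iteration, when b == 8
      break;
    b += 8;      // the 'add nsw': never wraps, but that only bounds
    if (b == 13) // the latch exit, not the early one, which is why a
      break;     // zero latch-exit count must not drive unrolling
  }
  *a = b;        // must store 8
}

int main() {
  int a = 0;
  nsw_latch(&a);
  std::printf("%d\n", a); // prints 8
  return 0;
}
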
+; +; CHECK: @nsw_latch +; CHECK: for.body: +; CHECK: %b.03 = phi i32 [ 0, %entry ], [ %add, %for.cond ] +; CHECK: return: +; CHECK: %b.03.lcssa = phi i32 [ %b.03, %for.body ], [ %b.03, %for.cond ] +define void @nsw_latch(i32* %a) nounwind { +entry: + br label %for.body + +for.body: ; preds = %for.cond, %entry + %b.03 = phi i32 [ 0, %entry ], [ %add, %for.cond ] + %tobool = icmp eq i32 %b.03, 0 + %add = add nsw i32 %b.03, 8 + br i1 %tobool, label %for.cond, label %return + +for.cond: ; preds = %for.body + %cmp = icmp eq i32 %add, 13 + br i1 %cmp, label %return, label %for.body + +return: ; preds = %for.body, %for.cond + %b.03.lcssa = phi i32 [ %b.03, %for.body ], [ %b.03, %for.cond ] + %retval.0 = phi i32 [ 1, %for.body ], [ 0, %for.cond ] + store i32 %b.03.lcssa, i32* %a, align 4 + ret void +} diff --git a/test/Transforms/LoopUnroll/unloop.ll b/test/Transforms/LoopUnroll/unloop.ll index 5a9cacda44..9a938cc287 100644 --- a/test/Transforms/LoopUnroll/unloop.ll +++ b/test/Transforms/LoopUnroll/unloop.ll @@ -21,8 +21,8 @@ outer: inner: %iv = phi i32 [ 0, %outer ], [ %inc, %tail ] %inc = add i32 %iv, 1 - %wbucond = call zeroext i1 @check() - br i1 %wbucond, label %outer.backedge, label %tail + call zeroext i1 @check() + br i1 true, label %outer.backedge, label %tail tail: br i1 false, label %inner, label %exit @@ -126,25 +126,27 @@ return: ; Ensure that only the middle loop is removed and rely on verify-loopinfo to ; check soundness. ; -; CHECK: @unloopDeepNested +; This test must be disabled until trip count computation can be optimized... +; rdar:14038809 [SCEV]: Optimize trip count computation for multi-exit loops. +; CHECKFIXME: @unloopDeepNested ; Inner-inner loop control. -; CHECK: while.cond.us.i: -; CHECK: br i1 %cmp.us.i, label %next_data.exit, label %while.body.us.i -; CHECK: if.then.us.i: -; CHECK: br label %while.cond.us.i +; CHECKFIXME: while.cond.us.i: +; CHECKFIXME: br i1 %cmp.us.i, label %next_data.exit, label %while.body.us.i +; CHECKFIXME: if.then.us.i: +; CHECKFIXME: br label %while.cond.us.i ; Inner loop tail. -; CHECK: if.else.i: -; CHECK: br label %while.cond.outer.i +; CHECKFIXME: if.else.i: +; CHECKFIXME: br label %while.cond.outer.i ; Middle loop control (removed). -; CHECK: valid_data.exit: -; CHECK-NOT: br -; CHECK: %cmp = call zeroext i1 @check() +; CHECKFIXME: valid_data.exit: +; CHECKFIXME-NOT: br +; CHECKFIXME: %cmp = call zeroext i1 @check() ; Outer loop control. -; CHECK: copy_data.exit: -; CHECK: br i1 %cmp38, label %if.then39, label %while.cond.outer +; CHECKFIXME: copy_data.exit: +; CHECKFIXME: br i1 %cmp38, label %if.then39, label %while.cond.outer ; Outer-outer loop tail. 
-; CHECK: while.cond.outer.outer.backedge: -; CHECK: br label %while.cond.outer.outer +; CHECKFIXME: while.cond.outer.outer.backedge: +; CHECKFIXME: br label %while.cond.outer.outer define void @unloopDeepNested() nounwind { for.cond8.preheader.i: %cmp113.i = call zeroext i1 @check() diff --git a/test/Transforms/LoopVectorize/lcssa-crash.ll b/test/Transforms/LoopVectorize/lcssa-crash.ll index 06b3b08aa0..de6be54849 100644 --- a/test/Transforms/LoopVectorize/lcssa-crash.ll +++ b/test/Transforms/LoopVectorize/lcssa-crash.ll @@ -27,3 +27,14 @@ for.end.i.i.i: unreachable } +; PR16139 +define void @test2(i8* %x) { +entry: + indirectbr i8* %x, [ label %L0, label %L1 ] + +L0: + br label %L0 + +L1: + ret void +} diff --git a/test/Transforms/LoopVectorize/no_outside_user.ll b/test/Transforms/LoopVectorize/no_outside_user.ll new file mode 100644 index 0000000000..6f0357c5e5 --- /dev/null +++ b/test/Transforms/LoopVectorize/no_outside_user.ll @@ -0,0 +1,41 @@ +; RUN: opt -S -loop-vectorize -force-vector-unroll=1 -force-vector-width=2 < %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128" + +@f = common global i32 0, align 4 +@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1 +@c = common global i32 0, align 4 +@a = common global i32 0, align 4 +@b = common global i32 0, align 4 +@e = common global i32 0, align 4 + +; We used to vectorize this loop. But it has a value that is used outside of the +; loop and is not a recognized reduction variable "tmp17". + +; CHECK-NOT: <2 x i32> + +define i32 @main() { +bb: + %b.promoted = load i32* @b, align 4 + br label %.lr.ph.i + +.lr.ph.i: + %tmp8 = phi i32 [ %tmp18, %bb16 ], [ %b.promoted, %bb ] + %tmp2 = icmp sgt i32 %tmp8, 10 + br i1 %tmp2, label %bb16, label %bb10 + +bb10: + br label %bb16 + +bb16: + %tmp17 = phi i32 [ 0, %bb10 ], [ 1, %.lr.ph.i ] + %tmp18 = add nsw i32 %tmp8, 1 + %tmp19 = icmp slt i32 %tmp18, 4 + br i1 %tmp19, label %.lr.ph.i, label %f1.exit.loopexit + +f1.exit.loopexit: + %.lcssa = phi i32 [ %tmp17, %bb16 ] + ret i32 %.lcssa +} + + diff --git a/test/Transforms/LoopVectorize/value-ptr-bug.ll b/test/Transforms/LoopVectorize/value-ptr-bug.ll new file mode 100644 index 0000000000..f376656f07 --- /dev/null +++ b/test/Transforms/LoopVectorize/value-ptr-bug.ll @@ -0,0 +1,50 @@ +; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-unroll=1 -dce -instcombine < %s | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" + +; PR16073 + +; Because we were caching value pointers across a function call that could RAUW, +; we would generate an undefined value store below: +; SCEVExpander::expandCodeFor would change a value (the start value of an +; induction) that we cached in the induction variable list. 
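The hazard described here is generic: cache a raw Value* across a call that may RAUW it, and the cached pointer silently goes stale. A toy C++ model of the fix, a handle that a registry retargets on replacement, standing in for llvm::TrackingVH; the Context and TrackingHandle names are invented for this sketch and are not the LLVM API:

#include <cassert>
#include <string>
#include <vector>

struct Value { std::string Name; };

// Toy registry: tracking handles are retargeted when a value is replaced.
struct Context {
  std::vector<Value **> Handles;
  void replaceAllUsesWith(Value *From, Value *To) {
    for (Value **H : Handles)
      if (*H == From)
        *H = To; // a raw cached Value* would keep pointing at From
  }
};

// Stand-in for llvm::TrackingVH: registers its slot with the context.
struct TrackingHandle {
  Value *V;
  TrackingHandle(Context &C, Value *V) : V(V) { C.Handles.push_back(&this->V); }
};

int main() {
  Context Ctx;
  Value Start{"start"}, Expanded{"start.expanded"};
  TrackingHandle Cached(Ctx, &Start);        // e.g. an induction start value
  Ctx.replaceAllUsesWith(&Start, &Expanded); // SCEVExpander-style RAUW
  assert(Cached.V == &Expanded);             // the handle followed the RAUW
  return 0;
}
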
+ +; CHECK: test_vh +; CHECK-NOT: store <4 x i8> undef + +define void @test_vh(i32* %ptr265, i32* %ptr266, i32 %sub267) { +entry: + br label %loop + +loop: + %inc = phi i32 [ %sub267, %entry ], [ %add, %loop] + %ext.inc = sext i32 %inc to i64 + %add.ptr265 = getelementptr inbounds i32* %ptr265, i64 %ext.inc + %add.ptr266 = getelementptr inbounds i32* %ptr266, i64 %ext.inc + %add = add i32 %inc, 9 + %cmp = icmp slt i32 %add, 140 + br i1 %cmp, label %block1, label %loop + +block1: + %sub267.lcssa = phi i32 [ %add, %loop ] + %add.ptr266.lcssa = phi i32* [ %add.ptr266, %loop ] + %add.ptr265.lcssa = phi i32* [ %add.ptr265, %loop ] + %tmp29 = bitcast i32* %add.ptr265.lcssa to i8* + %tmp30 = bitcast i32* %add.ptr266.lcssa to i8* + br label %do.body272 + +do.body272: + %row_width.5 = phi i32 [ %sub267.lcssa, %block1 ], [ %dec, %do.body272 ] + %sp.4 = phi i8* [ %tmp30, %block1 ], [ %incdec.ptr273, %do.body272 ] + %dp.addr.4 = phi i8* [ %tmp29, %block1 ], [ %incdec.ptr274, %do.body272 ] + %incdec.ptr273 = getelementptr inbounds i8* %sp.4, i64 1 + %tmp31 = load i8* %sp.4, align 1 + %incdec.ptr274 = getelementptr inbounds i8* %dp.addr.4, i64 1 + store i8 %tmp31, i8* %dp.addr.4, align 1 + %dec = add i32 %row_width.5, -1 + %cmp276 = icmp eq i32 %dec, 0 + br i1 %cmp276, label %loop.exit, label %do.body272 + +loop.exit: + ret void +} |
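To close the loop on no_outside_user.ll above: in source terms, a per-iteration value that escapes the loop without being a recognized reduction forces scalar code. A C++ back-translation of @main from that test, illustrative only:

#include <cstdio>

// Back-translation of @main from no_outside_user.ll (illustrative).
// tmp17 is recomputed every iteration and read after the loop; it is not
// a reduction, so the loop vectorizer must leave this loop scalar.
static int f1(int b) {
  int tmp8 = b, tmp17 = 0;
  do {
    tmp17 = (tmp8 > 10) ? 1 : 0;
    ++tmp8;
  } while (tmp8 < 4);
  return tmp17; // the outside-loop user
}

int main() {
  std::printf("%d\n", f1(0)); // prints 0
  return 0;
}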