From 7b837d8c75f78fe55c9b348b9ec2281169a48d2a Mon Sep 17 00:00:00 2001
From: Tim Northover
Date: Sat, 29 Mar 2014 10:18:08 +0000
Subject: ARM64: initial backend import

This adds a second implementation of the AArch64 architecture to LLVM,
accessible in parallel via the "arm64" triple. The plan over the coming
weeks & months is to merge the two into a single backend, during which
time thorough code review should naturally occur.

Everything will be easier with the target in-tree though, hence this
commit.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@205090 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/Analysis/CostModel/ARM64/lit.local.cfg | 3 +
 test/Analysis/CostModel/ARM64/select.ll | 38 +
 test/Analysis/CostModel/ARM64/store.ll | 22 +
 test/CodeGen/ARM64/2011-03-09-CPSRSpill.ll | 47 +
 test/CodeGen/ARM64/2011-03-17-AsmPrinterCrash.ll | 45 +
 .../ARM64/2011-03-21-Unaligned-Frame-Index.ll | 12 +
 test/CodeGen/ARM64/2011-04-21-CPSRBug.ll | 26 +
 test/CodeGen/ARM64/2011-10-18-LdStOptBug.ll | 31 +
 .../CodeGen/ARM64/2012-01-11-ComparisonDAGCrash.ll | 40 +
 .../ARM64/2012-05-07-DAGCombineVectorExtract.ll | 20 +
 test/CodeGen/ARM64/2012-05-07-MemcpyAlignBug.ll | 21 +
 test/CodeGen/ARM64/2012-05-09-LOADgot-bug.ll | 22 +
 test/CodeGen/ARM64/2012-05-22-LdStOptBug.ll | 50 +
 test/CodeGen/ARM64/2012-06-06-FPToUI.ll | 65 +
 test/CodeGen/ARM64/2012-07-11-InstrEmitterBug.ll | 56 +
 test/CodeGen/ARM64/2013-01-13-ffast-fcmp.ll | 19 +
 test/CodeGen/ARM64/2013-01-23-frem-crash.ll | 15 +
 test/CodeGen/ARM64/2013-01-23-sext-crash.ll | 37 +
 test/CodeGen/ARM64/2013-02-12-shufv8i8.ll | 11 +
 test/CodeGen/ARM64/AdvSIMD-Scalar.ll | 38 +
 test/CodeGen/ARM64/aapcs.ll | 86 +
 test/CodeGen/ARM64/abi-varargs.ll | 191 ++
 test/CodeGen/ARM64/abi.ll | 236 ++
 test/CodeGen/ARM64/abi_align.ll | 529 +++++
 test/CodeGen/ARM64/addp.ll | 32 +
 test/CodeGen/ARM64/addr-mode-folding.ll | 171 ++
 test/CodeGen/ARM64/addr-type-promotion.ll | 82 +
 test/CodeGen/ARM64/addrmode.ll | 72 +
 test/CodeGen/ARM64/alloc-no-stack-realign.ll | 21 +
 test/CodeGen/ARM64/alloca-frame-pointer-offset.ll | 29 +
 test/CodeGen/ARM64/andCmpBrToTBZ.ll | 72 +
 test/CodeGen/ARM64/anyregcc-crash.ll | 19 +
 test/CodeGen/ARM64/anyregcc.ll | 358 +++
 test/CodeGen/ARM64/arith-saturating.ll | 153 ++
 test/CodeGen/ARM64/arith.ll | 262 +++
 test/CodeGen/ARM64/atomic-128.ll | 213 ++
 test/CodeGen/ARM64/atomic.ll | 343 +++
 test/CodeGen/ARM64/big-imm-offsets.ll | 14 +
 test/CodeGen/ARM64/big-stack.ll | 21 +
 test/CodeGen/ARM64/bitfield-extract.ll | 406 ++++
 test/CodeGen/ARM64/blockaddress.ll | 30 +
 test/CodeGen/ARM64/build-vector.ll | 35 +
 test/CodeGen/ARM64/call-tailcalls.ll | 91 +
 test/CodeGen/ARM64/cast-opt.ll | 31 +
 test/CodeGen/ARM64/ccmp-heuristics.ll | 190 ++
 test/CodeGen/ARM64/ccmp.ll | 289 +++
 test/CodeGen/ARM64/coalesce-ext.ll | 17 +
 test/CodeGen/ARM64/code-model-large-abs.ll | 72 +
 test/CodeGen/ARM64/collect-loh-garbage-crash.ll | 37 +
 test/CodeGen/ARM64/collect-loh-str.ll | 23 +
 test/CodeGen/ARM64/collect-loh.ll | 47 +
 test/CodeGen/ARM64/compact-unwind-unhandled-cfi.S | 17 +
 test/CodeGen/ARM64/complex-ret.ll | 7 +
 test/CodeGen/ARM64/convert-v2f64-v2i32.ll | 24 +
 test/CodeGen/ARM64/convert-v2i32-v2f64.ll | 29 +
 test/CodeGen/ARM64/copy-tuple.ll | 146 ++
 test/CodeGen/ARM64/crc32.ll | 71 +
 test/CodeGen/ARM64/crypto.ll | 135 ++
 test/CodeGen/ARM64/cse.ll | 59 +
 test/CodeGen/ARM64/csel.ll | 222 ++
 test/CodeGen/ARM64/cvt.ll | 401 ++++
 test/CodeGen/ARM64/dagcombiner-convergence.ll | 19 +
 test/CodeGen/ARM64/dagcombiner-load-slicing.ll | 102 +
 test/CodeGen/ARM64/dup.ll | 322 +++
test/CodeGen/ARM64/early-ifcvt.ll | 423 ++++ test/CodeGen/ARM64/elf-calls.ll | 20 + test/CodeGen/ARM64/elf-constpool.ll | 13 + test/CodeGen/ARM64/elf-globals.ll | 115 + test/CodeGen/ARM64/ext.ll | 101 + test/CodeGen/ARM64/extend-int-to-fp.ll | 19 + test/CodeGen/ARM64/extend.ll | 15 + test/CodeGen/ARM64/extload-knownzero.ll | 28 + test/CodeGen/ARM64/extract.ll | 58 + test/CodeGen/ARM64/extract_subvector.ll | 51 + test/CodeGen/ARM64/fast-isel-addr-offset.ll | 47 + test/CodeGen/ARM64/fast-isel-alloca.ll | 24 + test/CodeGen/ARM64/fast-isel-br.ll | 155 ++ test/CodeGen/ARM64/fast-isel-call.ll | 91 + test/CodeGen/ARM64/fast-isel-conversion.ll | 416 ++++ test/CodeGen/ARM64/fast-isel-fcmp.ll | 146 ++ test/CodeGen/ARM64/fast-isel-gv.ll | 38 + test/CodeGen/ARM64/fast-isel-icmp.ll | 214 ++ test/CodeGen/ARM64/fast-isel-indirectbr.ll | 36 + test/CodeGen/ARM64/fast-isel-intrinsic.ll | 135 ++ test/CodeGen/ARM64/fast-isel-materialize.ll | 27 + test/CodeGen/ARM64/fast-isel-noconvert.ll | 36 + test/CodeGen/ARM64/fast-isel-rem.ll | 33 + test/CodeGen/ARM64/fast-isel-ret.ll | 63 + test/CodeGen/ARM64/fast-isel-select.ll | 63 + test/CodeGen/ARM64/fast-isel.ll | 95 + test/CodeGen/ARM64/fastcc-tailcall.ll | 24 + .../ARM64/fastisel-gep-promote-before-add.ll | 18 + test/CodeGen/ARM64/fcmp-opt.ll | 173 ++ test/CodeGen/ARM64/fcopysign.ll | 51 + .../ARM64/fixed-point-scalar-cvt-dagcombine.ll | 15 + test/CodeGen/ARM64/fmadd.ll | 74 + test/CodeGen/ARM64/fmax.ll | 21 + test/CodeGen/ARM64/fmuladd.ll | 88 + test/CodeGen/ARM64/fold-address.ll | 79 + test/CodeGen/ARM64/fold-lsl.ll | 79 + test/CodeGen/ARM64/fp-imm.ll | 21 + test/CodeGen/ARM64/fp.ll | 8 + test/CodeGen/ARM64/fp128-folding.ll | 17 + test/CodeGen/ARM64/fp128.ll | 274 +++ test/CodeGen/ARM64/frame-index.ll | 11 + test/CodeGen/ARM64/frameaddr.ll | 15 + test/CodeGen/ARM64/global-address.ll | 14 + test/CodeGen/ARM64/hello.ll | 38 + test/CodeGen/ARM64/i16-subreg-extract.ll | 12 + test/CodeGen/ARM64/icmp-opt.ll | 17 + test/CodeGen/ARM64/illegal-float-ops.ll | 247 ++ test/CodeGen/ARM64/indexed-memory.ll | 351 +++ test/CodeGen/ARM64/inline-asm-error-I.ll | 11 + test/CodeGen/ARM64/inline-asm-error-J.ll | 11 + test/CodeGen/ARM64/inline-asm-error-K.ll | 11 + test/CodeGen/ARM64/inline-asm-error-L.ll | 11 + test/CodeGen/ARM64/inline-asm-error-M.ll | 11 + test/CodeGen/ARM64/inline-asm-error-N.ll | 11 + test/CodeGen/ARM64/inline-asm-zero-reg-error.ll | 11 + test/CodeGen/ARM64/inline-asm.ll | 230 ++ test/CodeGen/ARM64/join-reserved.ll | 17 + test/CodeGen/ARM64/jumptable.ll | 35 + test/CodeGen/ARM64/ld1.ll | 1254 ++++++++++ test/CodeGen/ARM64/ldp.ll | 149 ++ test/CodeGen/ARM64/ldur.ll | 67 + test/CodeGen/ARM64/ldxr-stxr.ll | 143 ++ test/CodeGen/ARM64/leaf-compact-unwind.ll | 161 ++ test/CodeGen/ARM64/leaf.ll | 13 + test/CodeGen/ARM64/lit.local.cfg | 6 + test/CodeGen/ARM64/long-shift.ll | 59 + test/CodeGen/ARM64/memcpy-inline.ll | 112 + test/CodeGen/ARM64/memset-inline.ll | 27 + test/CodeGen/ARM64/memset-to-bzero.ll | 101 + test/CodeGen/ARM64/movi.ll | 202 ++ test/CodeGen/ARM64/mul.ll | 90 + test/CodeGen/ARM64/neon-compare-instructions.ll | 1191 ++++++++++ test/CodeGen/ARM64/patchpoint.ll | 163 ++ test/CodeGen/ARM64/platform-reg.ll | 26 + test/CodeGen/ARM64/popcnt.ll | 43 + test/CodeGen/ARM64/prefetch.ll | 88 + test/CodeGen/ARM64/promote-const.ll | 255 +++ test/CodeGen/ARM64/redzone.ll | 18 + test/CodeGen/ARM64/register-offset-addressing.ll | 12 + test/CodeGen/ARM64/register-pairing.ll | 53 + test/CodeGen/ARM64/regress-f128csel-flags.ll | 27 + test/CodeGen/ARM64/return-vector.ll | 11 
+ test/CodeGen/ARM64/returnaddr.ll | 26 + test/CodeGen/ARM64/rev.ll | 221 ++ test/CodeGen/ARM64/rounding.ll | 208 ++ test/CodeGen/ARM64/scaled_iv.ll | 38 + test/CodeGen/ARM64/scvt.ll | 830 +++++++ test/CodeGen/ARM64/shifted-sext.ll | 277 +++ test/CodeGen/ARM64/simd-scalar-to-vector.ll | 17 + test/CodeGen/ARM64/simplest-elf.ll | 18 + test/CodeGen/ARM64/sincos.ll | 31 + test/CodeGen/ARM64/sitofp-combine-chains.ll | 22 + test/CodeGen/ARM64/sli-sri-opt.ll | 41 + test/CodeGen/ARM64/smaxv.ll | 74 + test/CodeGen/ARM64/sminv.ll | 74 + test/CodeGen/ARM64/spill-lr.ll | 74 + test/CodeGen/ARM64/spill.ll | 15 + test/CodeGen/ARM64/st1.ll | 628 +++++ test/CodeGen/ARM64/stack-no-frame.ll | 20 + test/CodeGen/ARM64/stackmap.ll | 281 +++ test/CodeGen/ARM64/stacksave.ll | 20 + test/CodeGen/ARM64/stp.ll | 101 + test/CodeGen/ARM64/strict-align.ll | 25 + test/CodeGen/ARM64/stur.ll | 98 + test/CodeGen/ARM64/subvector-extend.ll | 141 ++ test/CodeGen/ARM64/swizzle-tbl-i16-layout.ll | 36 + test/CodeGen/ARM64/tbl.ll | 132 ++ test/CodeGen/ARM64/this-return.ll | 83 + test/CodeGen/ARM64/tls-darwin.ll | 18 + test/CodeGen/ARM64/tls-dynamic-together.ll | 18 + test/CodeGen/ARM64/tls-dynamics.ll | 135 ++ test/CodeGen/ARM64/tls-execs.ll | 63 + test/CodeGen/ARM64/trap.ll | 8 + test/CodeGen/ARM64/trn.ll | 134 ++ test/CodeGen/ARM64/trunc-store.ll | 75 + test/CodeGen/ARM64/umaxv.ll | 92 + test/CodeGen/ARM64/uminv.ll | 92 + test/CodeGen/ARM64/umov.ll | 33 + test/CodeGen/ARM64/unaligned_ldst.ll | 41 + test/CodeGen/ARM64/uzp.ll | 107 + test/CodeGen/ARM64/vaargs.ll | 20 + test/CodeGen/ARM64/vabs.ll | 796 +++++++ test/CodeGen/ARM64/vadd.ll | 941 ++++++++ test/CodeGen/ARM64/vaddlv.ll | 26 + test/CodeGen/ARM64/vaddv.ll | 233 ++ test/CodeGen/ARM64/variadic-aapcs.ll | 143 ++ test/CodeGen/ARM64/vbitwise.ll | 91 + test/CodeGen/ARM64/vclz.ll | 109 + test/CodeGen/ARM64/vcmp.ll | 227 ++ test/CodeGen/ARM64/vcnt.ll | 56 + test/CodeGen/ARM64/vcombine.ll | 17 + test/CodeGen/ARM64/vcvt.ll | 686 ++++++ test/CodeGen/ARM64/vcvt_f.ll | 82 + test/CodeGen/ARM64/vcvt_f32_su32.ll | 73 + test/CodeGen/ARM64/vcvt_n.ll | 49 + test/CodeGen/ARM64/vcvt_su32_f32.ll | 34 + test/CodeGen/ARM64/vcvtxd_f32_f64.ll | 11 + test/CodeGen/ARM64/vecCmpBr.ll | 207 ++ test/CodeGen/ARM64/vecFold.ll | 145 ++ test/CodeGen/ARM64/vector-ext.ll | 16 + test/CodeGen/ARM64/vector-imm.ll | 134 ++ test/CodeGen/ARM64/vector-ldst.ll | 601 +++++ test/CodeGen/ARM64/vext.ll | 464 ++++ test/CodeGen/ARM64/vfloatintrinsics.ll | 375 +++ test/CodeGen/ARM64/vhadd.ll | 249 ++ test/CodeGen/ARM64/vhsub.ll | 125 + test/CodeGen/ARM64/virtual_base.ll | 51 + test/CodeGen/ARM64/vmax.ll | 679 ++++++ test/CodeGen/ARM64/vminmaxnm.ll | 68 + test/CodeGen/ARM64/vmovn.ll | 242 ++ test/CodeGen/ARM64/vmul.ll | 1969 ++++++++++++++++ test/CodeGen/ARM64/volatile.ll | 27 + test/CodeGen/ARM64/vqadd.ll | 300 +++ test/CodeGen/ARM64/vqsub.ll | 147 ++ test/CodeGen/ARM64/vselect.ll | 18 + test/CodeGen/ARM64/vsetcc_fp.ll | 11 + test/CodeGen/ARM64/vshift.ll | 1909 ++++++++++++++++ test/CodeGen/ARM64/vshr.ll | 49 + test/CodeGen/ARM64/vshuffle.ll | 115 + test/CodeGen/ARM64/vsqrt.ll | 177 ++ test/CodeGen/ARM64/vsra.ll | 142 ++ test/CodeGen/ARM64/vsub.ll | 417 ++++ test/CodeGen/ARM64/weak-reference.ll | 10 + test/CodeGen/ARM64/xaluo.ll | 524 +++++ test/CodeGen/ARM64/zero-cycle-regmov.ll | 17 + test/CodeGen/ARM64/zero-cycle-zeroing.ll | 49 + test/CodeGen/ARM64/zext.ll | 11 + test/CodeGen/ARM64/zextload-unscaled.ll | 40 + test/CodeGen/ARM64/zip.ll | 107 + test/DebugInfo/ARM64/lit.local.cfg | 4 + 
test/DebugInfo/ARM64/struct_by_value.ll | 68 + test/MC/ARM64/advsimd.s | 1997 ++++++++++++++++ test/MC/ARM64/aliases.s | 733 ++++++ test/MC/ARM64/arithmetic-encoding.s | 631 +++++ test/MC/ARM64/arm64-fixup.s | 10 + test/MC/ARM64/basic-a64-instructions.s | 18 + test/MC/ARM64/bitfield-encoding.s | 30 + test/MC/ARM64/branch-encoding.s | 159 ++ test/MC/ARM64/crypto.s | 66 + test/MC/ARM64/diags.s | 242 ++ test/MC/ARM64/directive_loh.s | 93 + test/MC/ARM64/elf-relocs.s | 249 ++ test/MC/ARM64/fp-encoding.s | 507 +++++ test/MC/ARM64/large-relocs.s | 38 + test/MC/ARM64/lit.local.cfg | 6 + test/MC/ARM64/logical-encoding.s | 224 ++ test/MC/ARM64/mapping-across-sections.s | 28 + test/MC/ARM64/mapping-within-section.s | 23 + test/MC/ARM64/memory.s | 634 ++++++ test/MC/ARM64/separator.s | 20 + test/MC/ARM64/simd-ldst.s | 2404 ++++++++++++++++++++ test/MC/ARM64/small-data-fixups.s | 24 + test/MC/ARM64/system-encoding.s | 679 ++++++ test/MC/ARM64/tls-modifiers-darwin.s | 13 + test/MC/ARM64/tls-relocs.s | 320 +++ test/MC/ARM64/variable-exprs.s | 40 + test/MC/Disassembler/ARM64/advsimd.txt | 2282 +++++++++++++++++++ test/MC/Disassembler/ARM64/arithmetic.txt | 522 +++++ test/MC/Disassembler/ARM64/bitfield.txt | 29 + test/MC/Disassembler/ARM64/branch.txt | 75 + test/MC/Disassembler/ARM64/crc32.txt | 18 + test/MC/Disassembler/ARM64/crypto.txt | 47 + test/MC/Disassembler/ARM64/invalid-logical.txt | 6 + test/MC/Disassembler/ARM64/lit.local.cfg | 5 + test/MC/Disassembler/ARM64/logical.txt | 217 ++ test/MC/Disassembler/ARM64/memory.txt | 558 +++++ test/MC/Disassembler/ARM64/scalar-fp.txt | 255 +++ test/MC/Disassembler/ARM64/system.txt | 58 + .../MC/MachO/ARM64/darwin-ARM64-local-label-diff.s | 21 + test/MC/MachO/ARM64/darwin-ARM64-reloc.s | 157 ++ test/MC/MachO/ARM64/lit.local.cfg | 4 + test/Transforms/GlobalMerge/ARM/arm.ll | 85 + test/Transforms/GlobalMerge/ARM/lit.local.cfg | 4 + test/Transforms/GlobalMerge/ARM64/arm64.ll | 88 + test/Transforms/GlobalMerge/ARM64/lit.local.cfg | 4 + .../InstCombine/2012-04-23-Neon-Intrinsics.ll | 69 +- test/Transforms/InstCombine/sincospi.ll | 1 + .../LoopStrengthReduce/ARM64/lit.local.cfg | 5 + .../LoopStrengthReduce/ARM64/lsr-memcpy.ll | 33 + .../LoopStrengthReduce/ARM64/lsr-memset.ll | 101 + test/Transforms/LoopVectorize/ARM64/gather-cost.ll | 85 + test/Transforms/LoopVectorize/ARM64/lit.local.cfg | 6 + 286 files changed, 46411 insertions(+), 3 deletions(-) create mode 100644 test/Analysis/CostModel/ARM64/lit.local.cfg create mode 100644 test/Analysis/CostModel/ARM64/select.ll create mode 100644 test/Analysis/CostModel/ARM64/store.ll create mode 100644 test/CodeGen/ARM64/2011-03-09-CPSRSpill.ll create mode 100644 test/CodeGen/ARM64/2011-03-17-AsmPrinterCrash.ll create mode 100644 test/CodeGen/ARM64/2011-03-21-Unaligned-Frame-Index.ll create mode 100644 test/CodeGen/ARM64/2011-04-21-CPSRBug.ll create mode 100644 test/CodeGen/ARM64/2011-10-18-LdStOptBug.ll create mode 100644 test/CodeGen/ARM64/2012-01-11-ComparisonDAGCrash.ll create mode 100644 test/CodeGen/ARM64/2012-05-07-DAGCombineVectorExtract.ll create mode 100644 test/CodeGen/ARM64/2012-05-07-MemcpyAlignBug.ll create mode 100644 test/CodeGen/ARM64/2012-05-09-LOADgot-bug.ll create mode 100644 test/CodeGen/ARM64/2012-05-22-LdStOptBug.ll create mode 100644 test/CodeGen/ARM64/2012-06-06-FPToUI.ll create mode 100644 test/CodeGen/ARM64/2012-07-11-InstrEmitterBug.ll create mode 100644 test/CodeGen/ARM64/2013-01-13-ffast-fcmp.ll create mode 100644 test/CodeGen/ARM64/2013-01-23-frem-crash.ll create mode 100644 
test/CodeGen/ARM64/2013-01-23-sext-crash.ll create mode 100644 test/CodeGen/ARM64/2013-02-12-shufv8i8.ll create mode 100644 test/CodeGen/ARM64/AdvSIMD-Scalar.ll create mode 100644 test/CodeGen/ARM64/aapcs.ll create mode 100644 test/CodeGen/ARM64/abi-varargs.ll create mode 100644 test/CodeGen/ARM64/abi.ll create mode 100644 test/CodeGen/ARM64/abi_align.ll create mode 100644 test/CodeGen/ARM64/addp.ll create mode 100644 test/CodeGen/ARM64/addr-mode-folding.ll create mode 100644 test/CodeGen/ARM64/addr-type-promotion.ll create mode 100644 test/CodeGen/ARM64/addrmode.ll create mode 100644 test/CodeGen/ARM64/alloc-no-stack-realign.ll create mode 100644 test/CodeGen/ARM64/alloca-frame-pointer-offset.ll create mode 100644 test/CodeGen/ARM64/andCmpBrToTBZ.ll create mode 100644 test/CodeGen/ARM64/anyregcc-crash.ll create mode 100644 test/CodeGen/ARM64/anyregcc.ll create mode 100644 test/CodeGen/ARM64/arith-saturating.ll create mode 100644 test/CodeGen/ARM64/arith.ll create mode 100644 test/CodeGen/ARM64/atomic-128.ll create mode 100644 test/CodeGen/ARM64/atomic.ll create mode 100644 test/CodeGen/ARM64/big-imm-offsets.ll create mode 100644 test/CodeGen/ARM64/big-stack.ll create mode 100644 test/CodeGen/ARM64/bitfield-extract.ll create mode 100644 test/CodeGen/ARM64/blockaddress.ll create mode 100644 test/CodeGen/ARM64/build-vector.ll create mode 100644 test/CodeGen/ARM64/call-tailcalls.ll create mode 100644 test/CodeGen/ARM64/cast-opt.ll create mode 100644 test/CodeGen/ARM64/ccmp-heuristics.ll create mode 100644 test/CodeGen/ARM64/ccmp.ll create mode 100644 test/CodeGen/ARM64/coalesce-ext.ll create mode 100644 test/CodeGen/ARM64/code-model-large-abs.ll create mode 100644 test/CodeGen/ARM64/collect-loh-garbage-crash.ll create mode 100644 test/CodeGen/ARM64/collect-loh-str.ll create mode 100644 test/CodeGen/ARM64/collect-loh.ll create mode 100644 test/CodeGen/ARM64/compact-unwind-unhandled-cfi.S create mode 100644 test/CodeGen/ARM64/complex-ret.ll create mode 100644 test/CodeGen/ARM64/convert-v2f64-v2i32.ll create mode 100644 test/CodeGen/ARM64/convert-v2i32-v2f64.ll create mode 100644 test/CodeGen/ARM64/copy-tuple.ll create mode 100644 test/CodeGen/ARM64/crc32.ll create mode 100644 test/CodeGen/ARM64/crypto.ll create mode 100644 test/CodeGen/ARM64/cse.ll create mode 100644 test/CodeGen/ARM64/csel.ll create mode 100644 test/CodeGen/ARM64/cvt.ll create mode 100644 test/CodeGen/ARM64/dagcombiner-convergence.ll create mode 100644 test/CodeGen/ARM64/dagcombiner-load-slicing.ll create mode 100644 test/CodeGen/ARM64/dup.ll create mode 100644 test/CodeGen/ARM64/early-ifcvt.ll create mode 100644 test/CodeGen/ARM64/elf-calls.ll create mode 100644 test/CodeGen/ARM64/elf-constpool.ll create mode 100644 test/CodeGen/ARM64/elf-globals.ll create mode 100644 test/CodeGen/ARM64/ext.ll create mode 100644 test/CodeGen/ARM64/extend-int-to-fp.ll create mode 100644 test/CodeGen/ARM64/extend.ll create mode 100644 test/CodeGen/ARM64/extload-knownzero.ll create mode 100644 test/CodeGen/ARM64/extract.ll create mode 100644 test/CodeGen/ARM64/extract_subvector.ll create mode 100644 test/CodeGen/ARM64/fast-isel-addr-offset.ll create mode 100644 test/CodeGen/ARM64/fast-isel-alloca.ll create mode 100644 test/CodeGen/ARM64/fast-isel-br.ll create mode 100644 test/CodeGen/ARM64/fast-isel-call.ll create mode 100644 test/CodeGen/ARM64/fast-isel-conversion.ll create mode 100644 test/CodeGen/ARM64/fast-isel-fcmp.ll create mode 100644 test/CodeGen/ARM64/fast-isel-gv.ll create mode 100644 test/CodeGen/ARM64/fast-isel-icmp.ll create mode 
100644 test/CodeGen/ARM64/fast-isel-indirectbr.ll create mode 100644 test/CodeGen/ARM64/fast-isel-intrinsic.ll create mode 100644 test/CodeGen/ARM64/fast-isel-materialize.ll create mode 100644 test/CodeGen/ARM64/fast-isel-noconvert.ll create mode 100644 test/CodeGen/ARM64/fast-isel-rem.ll create mode 100644 test/CodeGen/ARM64/fast-isel-ret.ll create mode 100644 test/CodeGen/ARM64/fast-isel-select.ll create mode 100644 test/CodeGen/ARM64/fast-isel.ll create mode 100644 test/CodeGen/ARM64/fastcc-tailcall.ll create mode 100644 test/CodeGen/ARM64/fastisel-gep-promote-before-add.ll create mode 100644 test/CodeGen/ARM64/fcmp-opt.ll create mode 100644 test/CodeGen/ARM64/fcopysign.ll create mode 100644 test/CodeGen/ARM64/fixed-point-scalar-cvt-dagcombine.ll create mode 100644 test/CodeGen/ARM64/fmadd.ll create mode 100644 test/CodeGen/ARM64/fmax.ll create mode 100644 test/CodeGen/ARM64/fmuladd.ll create mode 100644 test/CodeGen/ARM64/fold-address.ll create mode 100644 test/CodeGen/ARM64/fold-lsl.ll create mode 100644 test/CodeGen/ARM64/fp-imm.ll create mode 100644 test/CodeGen/ARM64/fp.ll create mode 100644 test/CodeGen/ARM64/fp128-folding.ll create mode 100644 test/CodeGen/ARM64/fp128.ll create mode 100644 test/CodeGen/ARM64/frame-index.ll create mode 100644 test/CodeGen/ARM64/frameaddr.ll create mode 100644 test/CodeGen/ARM64/global-address.ll create mode 100644 test/CodeGen/ARM64/hello.ll create mode 100644 test/CodeGen/ARM64/i16-subreg-extract.ll create mode 100644 test/CodeGen/ARM64/icmp-opt.ll create mode 100644 test/CodeGen/ARM64/illegal-float-ops.ll create mode 100644 test/CodeGen/ARM64/indexed-memory.ll create mode 100644 test/CodeGen/ARM64/inline-asm-error-I.ll create mode 100644 test/CodeGen/ARM64/inline-asm-error-J.ll create mode 100644 test/CodeGen/ARM64/inline-asm-error-K.ll create mode 100644 test/CodeGen/ARM64/inline-asm-error-L.ll create mode 100644 test/CodeGen/ARM64/inline-asm-error-M.ll create mode 100644 test/CodeGen/ARM64/inline-asm-error-N.ll create mode 100644 test/CodeGen/ARM64/inline-asm-zero-reg-error.ll create mode 100644 test/CodeGen/ARM64/inline-asm.ll create mode 100644 test/CodeGen/ARM64/join-reserved.ll create mode 100644 test/CodeGen/ARM64/jumptable.ll create mode 100644 test/CodeGen/ARM64/ld1.ll create mode 100644 test/CodeGen/ARM64/ldp.ll create mode 100644 test/CodeGen/ARM64/ldur.ll create mode 100644 test/CodeGen/ARM64/ldxr-stxr.ll create mode 100644 test/CodeGen/ARM64/leaf-compact-unwind.ll create mode 100644 test/CodeGen/ARM64/leaf.ll create mode 100644 test/CodeGen/ARM64/lit.local.cfg create mode 100644 test/CodeGen/ARM64/long-shift.ll create mode 100644 test/CodeGen/ARM64/memcpy-inline.ll create mode 100644 test/CodeGen/ARM64/memset-inline.ll create mode 100644 test/CodeGen/ARM64/memset-to-bzero.ll create mode 100644 test/CodeGen/ARM64/movi.ll create mode 100644 test/CodeGen/ARM64/mul.ll create mode 100644 test/CodeGen/ARM64/neon-compare-instructions.ll create mode 100644 test/CodeGen/ARM64/patchpoint.ll create mode 100644 test/CodeGen/ARM64/platform-reg.ll create mode 100644 test/CodeGen/ARM64/popcnt.ll create mode 100644 test/CodeGen/ARM64/prefetch.ll create mode 100644 test/CodeGen/ARM64/promote-const.ll create mode 100644 test/CodeGen/ARM64/redzone.ll create mode 100644 test/CodeGen/ARM64/register-offset-addressing.ll create mode 100644 test/CodeGen/ARM64/register-pairing.ll create mode 100644 test/CodeGen/ARM64/regress-f128csel-flags.ll create mode 100644 test/CodeGen/ARM64/return-vector.ll create mode 100644 test/CodeGen/ARM64/returnaddr.ll create 
mode 100644 test/CodeGen/ARM64/rev.ll create mode 100644 test/CodeGen/ARM64/rounding.ll create mode 100644 test/CodeGen/ARM64/scaled_iv.ll create mode 100644 test/CodeGen/ARM64/scvt.ll create mode 100644 test/CodeGen/ARM64/shifted-sext.ll create mode 100644 test/CodeGen/ARM64/simd-scalar-to-vector.ll create mode 100644 test/CodeGen/ARM64/simplest-elf.ll create mode 100644 test/CodeGen/ARM64/sincos.ll create mode 100644 test/CodeGen/ARM64/sitofp-combine-chains.ll create mode 100644 test/CodeGen/ARM64/sli-sri-opt.ll create mode 100644 test/CodeGen/ARM64/smaxv.ll create mode 100644 test/CodeGen/ARM64/sminv.ll create mode 100644 test/CodeGen/ARM64/spill-lr.ll create mode 100644 test/CodeGen/ARM64/spill.ll create mode 100644 test/CodeGen/ARM64/st1.ll create mode 100644 test/CodeGen/ARM64/stack-no-frame.ll create mode 100644 test/CodeGen/ARM64/stackmap.ll create mode 100644 test/CodeGen/ARM64/stacksave.ll create mode 100644 test/CodeGen/ARM64/stp.ll create mode 100644 test/CodeGen/ARM64/strict-align.ll create mode 100644 test/CodeGen/ARM64/stur.ll create mode 100644 test/CodeGen/ARM64/subvector-extend.ll create mode 100644 test/CodeGen/ARM64/swizzle-tbl-i16-layout.ll create mode 100644 test/CodeGen/ARM64/tbl.ll create mode 100644 test/CodeGen/ARM64/this-return.ll create mode 100644 test/CodeGen/ARM64/tls-darwin.ll create mode 100644 test/CodeGen/ARM64/tls-dynamic-together.ll create mode 100644 test/CodeGen/ARM64/tls-dynamics.ll create mode 100644 test/CodeGen/ARM64/tls-execs.ll create mode 100644 test/CodeGen/ARM64/trap.ll create mode 100644 test/CodeGen/ARM64/trn.ll create mode 100644 test/CodeGen/ARM64/trunc-store.ll create mode 100644 test/CodeGen/ARM64/umaxv.ll create mode 100644 test/CodeGen/ARM64/uminv.ll create mode 100644 test/CodeGen/ARM64/umov.ll create mode 100644 test/CodeGen/ARM64/unaligned_ldst.ll create mode 100644 test/CodeGen/ARM64/uzp.ll create mode 100644 test/CodeGen/ARM64/vaargs.ll create mode 100644 test/CodeGen/ARM64/vabs.ll create mode 100644 test/CodeGen/ARM64/vadd.ll create mode 100644 test/CodeGen/ARM64/vaddlv.ll create mode 100644 test/CodeGen/ARM64/vaddv.ll create mode 100644 test/CodeGen/ARM64/variadic-aapcs.ll create mode 100644 test/CodeGen/ARM64/vbitwise.ll create mode 100644 test/CodeGen/ARM64/vclz.ll create mode 100644 test/CodeGen/ARM64/vcmp.ll create mode 100644 test/CodeGen/ARM64/vcnt.ll create mode 100644 test/CodeGen/ARM64/vcombine.ll create mode 100644 test/CodeGen/ARM64/vcvt.ll create mode 100644 test/CodeGen/ARM64/vcvt_f.ll create mode 100644 test/CodeGen/ARM64/vcvt_f32_su32.ll create mode 100644 test/CodeGen/ARM64/vcvt_n.ll create mode 100644 test/CodeGen/ARM64/vcvt_su32_f32.ll create mode 100644 test/CodeGen/ARM64/vcvtxd_f32_f64.ll create mode 100644 test/CodeGen/ARM64/vecCmpBr.ll create mode 100644 test/CodeGen/ARM64/vecFold.ll create mode 100644 test/CodeGen/ARM64/vector-ext.ll create mode 100644 test/CodeGen/ARM64/vector-imm.ll create mode 100644 test/CodeGen/ARM64/vector-ldst.ll create mode 100644 test/CodeGen/ARM64/vext.ll create mode 100644 test/CodeGen/ARM64/vfloatintrinsics.ll create mode 100644 test/CodeGen/ARM64/vhadd.ll create mode 100644 test/CodeGen/ARM64/vhsub.ll create mode 100644 test/CodeGen/ARM64/virtual_base.ll create mode 100644 test/CodeGen/ARM64/vmax.ll create mode 100644 test/CodeGen/ARM64/vminmaxnm.ll create mode 100644 test/CodeGen/ARM64/vmovn.ll create mode 100644 test/CodeGen/ARM64/vmul.ll create mode 100644 test/CodeGen/ARM64/volatile.ll create mode 100644 test/CodeGen/ARM64/vqadd.ll create mode 100644 
test/CodeGen/ARM64/vqsub.ll create mode 100644 test/CodeGen/ARM64/vselect.ll create mode 100644 test/CodeGen/ARM64/vsetcc_fp.ll create mode 100644 test/CodeGen/ARM64/vshift.ll create mode 100644 test/CodeGen/ARM64/vshr.ll create mode 100644 test/CodeGen/ARM64/vshuffle.ll create mode 100644 test/CodeGen/ARM64/vsqrt.ll create mode 100644 test/CodeGen/ARM64/vsra.ll create mode 100644 test/CodeGen/ARM64/vsub.ll create mode 100644 test/CodeGen/ARM64/weak-reference.ll create mode 100644 test/CodeGen/ARM64/xaluo.ll create mode 100644 test/CodeGen/ARM64/zero-cycle-regmov.ll create mode 100644 test/CodeGen/ARM64/zero-cycle-zeroing.ll create mode 100644 test/CodeGen/ARM64/zext.ll create mode 100644 test/CodeGen/ARM64/zextload-unscaled.ll create mode 100644 test/CodeGen/ARM64/zip.ll create mode 100644 test/DebugInfo/ARM64/lit.local.cfg create mode 100644 test/DebugInfo/ARM64/struct_by_value.ll create mode 100644 test/MC/ARM64/advsimd.s create mode 100644 test/MC/ARM64/aliases.s create mode 100644 test/MC/ARM64/arithmetic-encoding.s create mode 100644 test/MC/ARM64/arm64-fixup.s create mode 100644 test/MC/ARM64/basic-a64-instructions.s create mode 100644 test/MC/ARM64/bitfield-encoding.s create mode 100644 test/MC/ARM64/branch-encoding.s create mode 100644 test/MC/ARM64/crypto.s create mode 100644 test/MC/ARM64/diags.s create mode 100644 test/MC/ARM64/directive_loh.s create mode 100644 test/MC/ARM64/elf-relocs.s create mode 100644 test/MC/ARM64/fp-encoding.s create mode 100644 test/MC/ARM64/large-relocs.s create mode 100644 test/MC/ARM64/lit.local.cfg create mode 100644 test/MC/ARM64/logical-encoding.s create mode 100644 test/MC/ARM64/mapping-across-sections.s create mode 100644 test/MC/ARM64/mapping-within-section.s create mode 100644 test/MC/ARM64/memory.s create mode 100644 test/MC/ARM64/separator.s create mode 100644 test/MC/ARM64/simd-ldst.s create mode 100644 test/MC/ARM64/small-data-fixups.s create mode 100644 test/MC/ARM64/system-encoding.s create mode 100644 test/MC/ARM64/tls-modifiers-darwin.s create mode 100644 test/MC/ARM64/tls-relocs.s create mode 100644 test/MC/ARM64/variable-exprs.s create mode 100644 test/MC/Disassembler/ARM64/advsimd.txt create mode 100644 test/MC/Disassembler/ARM64/arithmetic.txt create mode 100644 test/MC/Disassembler/ARM64/bitfield.txt create mode 100644 test/MC/Disassembler/ARM64/branch.txt create mode 100644 test/MC/Disassembler/ARM64/crc32.txt create mode 100644 test/MC/Disassembler/ARM64/crypto.txt create mode 100644 test/MC/Disassembler/ARM64/invalid-logical.txt create mode 100644 test/MC/Disassembler/ARM64/lit.local.cfg create mode 100644 test/MC/Disassembler/ARM64/logical.txt create mode 100644 test/MC/Disassembler/ARM64/memory.txt create mode 100644 test/MC/Disassembler/ARM64/scalar-fp.txt create mode 100644 test/MC/Disassembler/ARM64/system.txt create mode 100644 test/MC/MachO/ARM64/darwin-ARM64-local-label-diff.s create mode 100644 test/MC/MachO/ARM64/darwin-ARM64-reloc.s create mode 100644 test/MC/MachO/ARM64/lit.local.cfg create mode 100644 test/Transforms/GlobalMerge/ARM/arm.ll create mode 100644 test/Transforms/GlobalMerge/ARM/lit.local.cfg create mode 100644 test/Transforms/GlobalMerge/ARM64/arm64.ll create mode 100644 test/Transforms/GlobalMerge/ARM64/lit.local.cfg create mode 100644 test/Transforms/LoopStrengthReduce/ARM64/lit.local.cfg create mode 100644 test/Transforms/LoopStrengthReduce/ARM64/lsr-memcpy.ll create mode 100644 test/Transforms/LoopStrengthReduce/ARM64/lsr-memset.ll create mode 100644 
test/Transforms/LoopVectorize/ARM64/gather-cost.ll create mode 100644 test/Transforms/LoopVectorize/ARM64/lit.local.cfg (limited to 'test') diff --git a/test/Analysis/CostModel/ARM64/lit.local.cfg b/test/Analysis/CostModel/ARM64/lit.local.cfg new file mode 100644 index 0000000000..84ac9811f0 --- /dev/null +++ b/test/Analysis/CostModel/ARM64/lit.local.cfg @@ -0,0 +1,3 @@ +targets = set(config.root.targets_to_build.split()) +if not 'ARM64' in targets: + config.unsupported = True diff --git a/test/Analysis/CostModel/ARM64/select.ll b/test/Analysis/CostModel/ARM64/select.ll new file mode 100644 index 0000000000..216dc5ddc4 --- /dev/null +++ b/test/Analysis/CostModel/ARM64/select.ll @@ -0,0 +1,38 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" + +; CHECK-LABEL: select +define void @select() { + ; Scalar values + ; CHECK: cost of 1 {{.*}} select + %v1 = select i1 undef, i8 undef, i8 undef + ; CHECK: cost of 1 {{.*}} select + %v2 = select i1 undef, i16 undef, i16 undef + ; CHECK: cost of 1 {{.*}} select + %v3 = select i1 undef, i32 undef, i32 undef + ; CHECK: cost of 1 {{.*}} select + %v4 = select i1 undef, i64 undef, i64 undef + ; CHECK: cost of 1 {{.*}} select + %v5 = select i1 undef, float undef, float undef + ; CHECK: cost of 1 {{.*}} select + %v6 = select i1 undef, double undef, double undef + + ; Vector values - check for vectors that have a high cost because they end up + ; scalarized. + ; CHECK: cost of 320 {{.*}} select + %v13b = select <16 x i1> undef, <16 x i16> undef, <16 x i16> undef + + ; CHECK: cost of 160 {{.*}} select + %v15b = select <8 x i1> undef, <8 x i32> undef, <8 x i32> undef + ; CHECK: cost of 320 {{.*}} select + %v15c = select <16 x i1> undef, <16 x i32> undef, <16 x i32> undef + + ; CHECK: cost of 80 {{.*}} select + %v16a = select <4 x i1> undef, <4 x i64> undef, <4 x i64> undef + ; CHECK: cost of 160 {{.*}} select + %v16b = select <8 x i1> undef, <8 x i64> undef, <8 x i64> undef + ; CHECK: cost of 320 {{.*}} select + %v16c = select <16 x i1> undef, <16 x i64> undef, <16 x i64> undef + + ret void +} diff --git a/test/Analysis/CostModel/ARM64/store.ll b/test/Analysis/CostModel/ARM64/store.ll new file mode 100644 index 0000000000..0c9883cf2a --- /dev/null +++ b/test/Analysis/CostModel/ARM64/store.ll @@ -0,0 +1,22 @@ +; RUN: opt < %s -cost-model -analyze -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32" +; CHECK-LABEL: store +define void @store() { + ; Stores of <2 x i64> should be expensive because we don't split them and + ; and unaligned 16b stores have bad performance. + ; CHECK: cost of 12 {{.*}} store + store <2 x i64> undef, <2 x i64> * undef + + ; We scalarize the loads/stores because there is no vector register name for + ; these types (they get extended to v.4h/v.2s). 
+ ; CHECK: cost of 16 {{.*}} store + store <2 x i8> undef, <2 x i8> * undef + ; CHECK: cost of 64 {{.*}} store + store <4 x i8> undef, <4 x i8> * undef + ; CHECK: cost of 16 {{.*}} load + load <2 x i8> * undef + ; CHECK: cost of 64 {{.*}} load + load <4 x i8> * undef + + ret void +} diff --git a/test/CodeGen/ARM64/2011-03-09-CPSRSpill.ll b/test/CodeGen/ARM64/2011-03-09-CPSRSpill.ll new file mode 100644 index 0000000000..6fb7c3fb5e --- /dev/null +++ b/test/CodeGen/ARM64/2011-03-09-CPSRSpill.ll @@ -0,0 +1,47 @@ +; RUN: llc < %s -mtriple=arm64-apple-darwin + +; Can't copy or spill / restore CPSR. +; rdar://9105206 + +define fastcc void @t() ssp align 2 { +entry: + br i1 undef, label %bb3.i, label %bb2.i + +bb2.i: ; preds = %entry + br label %bb3.i + +bb3.i: ; preds = %bb2.i, %entry + br i1 undef, label %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71, label %bb.i69 + +bb.i69: ; preds = %bb3.i + br label %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71 + +_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71: ; preds = %bb.i69, %bb3.i + %0 = select i1 undef, float 0.000000e+00, float undef + %1 = fdiv float %0, undef + %2 = fcmp ult float %1, 0xBF847AE140000000 + %storemerge9 = select i1 %2, float %1, float 0.000000e+00 + store float %storemerge9, float* undef, align 4 + br i1 undef, label %bb42, label %bb47 + +bb42: ; preds = %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71 + br i1 undef, label %bb46, label %bb53 + +bb46: ; preds = %bb42 + br label %bb48 + +bb47: ; preds = %_ZN12gjkepa2_impl3EPA6appendERNS0_5sListEPNS0_5sFaceE.exit71 + br label %bb48 + +bb48: ; preds = %bb47, %bb46 + br i1 undef, label %bb1.i14, label %bb.i13 + +bb.i13: ; preds = %bb48 + br label %bb1.i14 + +bb1.i14: ; preds = %bb.i13, %bb48 + br label %bb53 + +bb53: ; preds = %bb1.i14, %bb42 + ret void +} diff --git a/test/CodeGen/ARM64/2011-03-17-AsmPrinterCrash.ll b/test/CodeGen/ARM64/2011-03-17-AsmPrinterCrash.ll new file mode 100644 index 0000000000..2b083d8049 --- /dev/null +++ b/test/CodeGen/ARM64/2011-03-17-AsmPrinterCrash.ll @@ -0,0 +1,45 @@ +; RUN: llc < %s -mtriple=arm64-apple-darwin + +; rdar://9146594 + +define void @drt_vsprintf() nounwind ssp { +entry: + %do_tab_convert = alloca i32, align 4 + br i1 undef, label %if.then24, label %if.else295, !dbg !13 + +if.then24: ; preds = %entry + unreachable + +if.else295: ; preds = %entry + call void @llvm.dbg.declare(metadata !{i32* %do_tab_convert}, metadata !16), !dbg !18 + store i32 0, i32* %do_tab_convert, align 4, !dbg !19 + unreachable +} + +declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone + +!llvm.dbg.gv = !{!0} +!llvm.dbg.sp = !{!1, !7, !10, !11, !12} + +!0 = metadata !{i32 589876, i32 0, metadata !1, metadata !"vsplive", metadata !"vsplive", metadata !"", metadata !2, i32 617, metadata !6, i32 1, i32 1, null, null} ; [ DW_TAG_variable ] +!1 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"drt_vsprintf", metadata !"drt_vsprintf", metadata !"", i32 616, metadata !4, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] +!2 = metadata !{i32 589865, metadata !20} ; [ DW_TAG_file_type ] +!3 = metadata !{i32 589841, metadata !20, i32 12, metadata !"clang version 3.0 (http://llvm.org/git/clang.git git:/git/puzzlebox/clang.git/ c4d1aea01c4444eb81bdbf391f1be309127c3cf1)", i1 true, metadata !"", i32 0, metadata !21, metadata !21, null, null, null, metadata !""} ; [ DW_TAG_compile_unit ] +!4 = metadata !{i32 589845, metadata !20, 
metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !5, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] +!5 = metadata !{metadata !6} +!6 = metadata !{i32 589860, null, metadata !3, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] +!7 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"putc_mem", metadata !"putc_mem", metadata !"", i32 30, metadata !8, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] +!8 = metadata !{i32 589845, metadata !20, metadata !2, metadata !"", i32 0, i64 0, i64 0, i32 0, i32 0, null, metadata !9, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] +!9 = metadata !{null} +!10 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"print_double", metadata !"print_double", metadata !"", i32 203, metadata !4, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] +!11 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"print_number", metadata !"print_number", metadata !"", i32 75, metadata !4, i1 true, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] +!12 = metadata !{i32 589870, metadata !20, metadata !2, metadata !"get_flags", metadata !"get_flags", metadata !"", i32 508, metadata !8, i1 true, i1 true, i32 0, i32 0, null, i32 256, i1 false, null, null, null, null, i32 0} ; [ DW_TAG_subprogram ] +!13 = metadata !{i32 653, i32 5, metadata !14, null} +!14 = metadata !{i32 589835, metadata !20, metadata !15, i32 652, i32 35, i32 2} ; [ DW_TAG_lexical_block ] +!15 = metadata !{i32 589835, metadata !20, metadata !1, i32 616, i32 1, i32 0} ; [ DW_TAG_lexical_block ] +!16 = metadata !{i32 590080, metadata !17, metadata !"do_tab_convert", metadata !2, i32 853, metadata !6, i32 0, null} ; [ DW_TAG_auto_variable ] +!17 = metadata !{i32 589835, metadata !20, metadata !14, i32 850, i32 12, i32 33} ; [ DW_TAG_lexical_block ] +!18 = metadata !{i32 853, i32 11, metadata !17, null} +!19 = metadata !{i32 853, i32 29, metadata !17, null} +!20 = metadata !{metadata !"print.i", metadata !"/Volumes/Ebi/echeng/radars/r9146594"} +!21 = metadata !{i32 0} diff --git a/test/CodeGen/ARM64/2011-03-21-Unaligned-Frame-Index.ll b/test/CodeGen/ARM64/2011-03-21-Unaligned-Frame-Index.ll new file mode 100644 index 0000000000..6f0ec34fc1 --- /dev/null +++ b/test/CodeGen/ARM64/2011-03-21-Unaligned-Frame-Index.ll @@ -0,0 +1,12 @@ +; RUN: llc < %s -march=arm64 | FileCheck %s +define void @foo(i64 %val) { +; CHECK: foo +; The stack frame store is not 64-bit aligned. Make sure we use an +; instruction that can handle that. +; CHECK: stur x0, [sp, #20] + %a = alloca [49 x i32], align 4 + %p32 = getelementptr inbounds [49 x i32]* %a, i64 0, i64 2 + %p = bitcast i32* %p32 to i64* + store i64 %val, i64* %p, align 8 + ret void +} diff --git a/test/CodeGen/ARM64/2011-04-21-CPSRBug.ll b/test/CodeGen/ARM64/2011-04-21-CPSRBug.ll new file mode 100644 index 0000000000..88232fcc0b --- /dev/null +++ b/test/CodeGen/ARM64/2011-04-21-CPSRBug.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -mtriple=arm64-apple-iOS5.0 + +; CPSR is not allocatable so fast allocatable wouldn't mark them killed. 
+; rdar://9313272 + +define hidden void @t() nounwind { +entry: + %cmp = icmp eq i32* null, undef + %frombool = zext i1 %cmp to i8 + store i8 %frombool, i8* undef, align 1 + %tmp4 = load i8* undef, align 1 + %tobool = trunc i8 %tmp4 to i1 + br i1 %tobool, label %land.lhs.true, label %if.end + +land.lhs.true: ; preds = %entry + unreachable + +if.end: ; preds = %entry + br i1 undef, label %land.lhs.true14, label %if.end33 + +land.lhs.true14: ; preds = %if.end + unreachable + +if.end33: ; preds = %if.end + unreachable +} diff --git a/test/CodeGen/ARM64/2011-10-18-LdStOptBug.ll b/test/CodeGen/ARM64/2011-10-18-LdStOptBug.ll new file mode 100644 index 0000000000..ea1cd02ca2 --- /dev/null +++ b/test/CodeGen/ARM64/2011-10-18-LdStOptBug.ll @@ -0,0 +1,31 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s + +; Can't fold the increment by 1<<12 into a post-increment load +; rdar://10301335 + +@test_data = common global i32 0, align 4 + +define void @t() nounwind ssp { +; CHECK-LABEL: t: +entry: + br label %for.body + +for.body: +; CHECK: for.body +; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}] +; CHECK: add x[[REG:[0-9]+]], +; CHECK: x[[REG]], #4096 + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 12 + %add = add nsw i64 %0, 34628173824 + %1 = inttoptr i64 %add to i32* + %2 = load volatile i32* %1, align 4096 + store volatile i32 %2, i32* @test_data, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 200 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/test/CodeGen/ARM64/2012-01-11-ComparisonDAGCrash.ll b/test/CodeGen/ARM64/2012-01-11-ComparisonDAGCrash.ll new file mode 100644 index 0000000000..d47dbb2816 --- /dev/null +++ b/test/CodeGen/ARM64/2012-01-11-ComparisonDAGCrash.ll @@ -0,0 +1,40 @@ +; RUN: llc < %s -march=arm64 + +; The target lowering for integer comparisons was replacing some DAG nodes +; during operation legalization, which resulted in dangling pointers, +; cycles in DAGs, and eventually crashes. This is the testcase for +; one of those crashes. 
(rdar://10653656) + +define void @test(i1 zeroext %IsArrow) nounwind ssp align 2 { +entry: + br i1 undef, label %return, label %lor.lhs.false + +lor.lhs.false: + br i1 undef, label %return, label %if.end + +if.end: + %tmp.i = load i64* undef, align 8 + %and.i.i.i = and i64 %tmp.i, -16 + br i1 %IsArrow, label %if.else_crit_edge, label %if.end32 + +if.else_crit_edge: + br i1 undef, label %if.end32, label %return + +if.end32: + %0 = icmp ult i32 undef, 3 + %1 = zext i64 %tmp.i to i320 + %.pn.v = select i1 %0, i320 128, i320 64 + %.pn = shl i320 %1, %.pn.v + %ins346392 = or i320 %.pn, 0 + store i320 %ins346392, i320* undef, align 8 + br i1 undef, label %sw.bb.i.i, label %exit + +sw.bb.i.i: + unreachable + +exit: + unreachable + +return: + ret void +} diff --git a/test/CodeGen/ARM64/2012-05-07-DAGCombineVectorExtract.ll b/test/CodeGen/ARM64/2012-05-07-DAGCombineVectorExtract.ll new file mode 100644 index 0000000000..a4d37e4868 --- /dev/null +++ b/test/CodeGen/ARM64/2012-05-07-DAGCombineVectorExtract.ll @@ -0,0 +1,20 @@ +; RUN: llc < %s -march=arm64 | FileCheck %s + +define i32 @foo(<4 x i32> %a, i32 %n) nounwind { +; CHECK-LABEL: foo: +; CHECK: fmov w0, s0 +; CHECK-NEXT: ret + %b = bitcast <4 x i32> %a to i128 + %c = trunc i128 %b to i32 + ret i32 %c +} + +define i64 @bar(<2 x i64> %a, i64 %n) nounwind { +; CHECK-LABEL: bar: +; CHECK: fmov x0, d0 +; CHECK-NEXT: ret + %b = bitcast <2 x i64> %a to i128 + %c = trunc i128 %b to i64 + ret i64 %c +} + diff --git a/test/CodeGen/ARM64/2012-05-07-MemcpyAlignBug.ll b/test/CodeGen/ARM64/2012-05-07-MemcpyAlignBug.ll new file mode 100644 index 0000000000..d59b0d0043 --- /dev/null +++ b/test/CodeGen/ARM64/2012-05-07-MemcpyAlignBug.ll @@ -0,0 +1,21 @@ +; RUN: llc < %s -march arm64 -mcpu=cyclone | FileCheck %s +; + +@b = private unnamed_addr constant [3 x i32] [i32 1768775988, i32 1685481784, i32 1836253201], align 4 + +; The important thing for this test is that we need an unaligned load of `l_b' +; ("ldr w2, [x1, #8]" in this case). + +; CHECK: adrp x[[PAGE:[0-9]+]], {{l_b@PAGE|.Lb}} +; CHECK: add x[[ADDR:[0-9]+]], x[[PAGE]], {{l_b@PAGEOFF|:lo12:.Lb}} +; CHECK-NEXT: ldr [[VAL:w[0-9]+]], [x[[ADDR]], #8] +; CHECK-NEXT: str [[VAL]], [x0, #8] +; CHECK-NEXT: ldr [[VAL2:x[0-9]+]], [x[[ADDR]]] +; CHECK-NEXT: str [[VAL2]], [x0] + +define void @foo(i8* %a) { + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast ([3 x i32]* @b to i8*), i64 12, i32 4, i1 false) + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind diff --git a/test/CodeGen/ARM64/2012-05-09-LOADgot-bug.ll b/test/CodeGen/ARM64/2012-05-09-LOADgot-bug.ll new file mode 100644 index 0000000000..d1840d3594 --- /dev/null +++ b/test/CodeGen/ARM64/2012-05-09-LOADgot-bug.ll @@ -0,0 +1,22 @@ +; RUN: llc -mtriple=arm64-apple-ios < %s | FileCheck %s +; RUN: llc -mtriple=arm64-linux-gnu -relocation-model=pic < %s | FileCheck %s --check-prefix=CHECK-LINUX +; + +define hidden void @t() optsize ssp { +entry: + store i64 zext (i32 ptrtoint (i64 (i32)* @x to i32) to i64), i64* undef, align 8 +; CHECK: adrp x{{[0-9]+}}, _x@GOTPAGE +; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}, _x@GOTPAGEOFF] +; CHECK-NEXT: and x{{[0-9]+}}, x{{[0-9]+}}, #0xffffffff +; CHECK-NEXT: str x{{[0-9]+}}, [x{{[0-9]+}}] + unreachable +} + +declare i64 @x(i32) optsize + +; Worth checking the Linux code is sensible too: only way to access +; the GOT is via a 64-bit load. Just loading wN is unacceptable +; (there's no ELF relocation to do that). 
+ +; CHECK-LINUX: adrp {{x[0-9]+}}, :got:x +; CHECK-LINUX: ldr {{x[0-9]+}}, [{{x[0-9]+}}, :got_lo12:x] diff --git a/test/CodeGen/ARM64/2012-05-22-LdStOptBug.ll b/test/CodeGen/ARM64/2012-05-22-LdStOptBug.ll new file mode 100644 index 0000000000..4b037db9c8 --- /dev/null +++ b/test/CodeGen/ARM64/2012-05-22-LdStOptBug.ll @@ -0,0 +1,50 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios -verify-machineinstrs | FileCheck %s + +; LdStOpt bug created illegal instruction: +; %D1, %D2 = LDPSi %X0, 1 +; rdar://11512047 + +%0 = type opaque +%struct.CGRect = type { %struct.CGPoint, %struct.CGSize } +%struct.CGPoint = type { double, double } +%struct.CGSize = type { double, double } + +@"OBJC_IVAR_$_UIScreen._bounds" = external hidden global i64, section "__DATA, __objc_ivar", align 8 + +define hidden %struct.CGRect @t(%0* nocapture %self, i8* nocapture %_cmd) nounwind readonly optsize ssp { +entry: +; CHECK-LABEL: t: +; CHECK: ldp d{{[0-9]+}}, d{{[0-9]+}} + %ivar = load i64* @"OBJC_IVAR_$_UIScreen._bounds", align 8, !invariant.load !4 + %0 = bitcast %0* %self to i8* + %add.ptr = getelementptr inbounds i8* %0, i64 %ivar + %add.ptr10.0 = bitcast i8* %add.ptr to double* + %tmp11 = load double* %add.ptr10.0, align 8 + %add.ptr.sum = add i64 %ivar, 8 + %add.ptr10.1 = getelementptr inbounds i8* %0, i64 %add.ptr.sum + %1 = bitcast i8* %add.ptr10.1 to double* + %tmp12 = load double* %1, align 8 + %add.ptr.sum17 = add i64 %ivar, 16 + %add.ptr4.1 = getelementptr inbounds i8* %0, i64 %add.ptr.sum17 + %add.ptr4.1.0 = bitcast i8* %add.ptr4.1 to double* + %tmp = load double* %add.ptr4.1.0, align 8 + %add.ptr4.1.sum = add i64 %ivar, 24 + %add.ptr4.1.1 = getelementptr inbounds i8* %0, i64 %add.ptr4.1.sum + %2 = bitcast i8* %add.ptr4.1.1 to double* + %tmp5 = load double* %2, align 8 + %insert14 = insertvalue %struct.CGPoint undef, double %tmp11, 0 + %insert16 = insertvalue %struct.CGPoint %insert14, double %tmp12, 1 + %insert = insertvalue %struct.CGRect undef, %struct.CGPoint %insert16, 0 + %insert7 = insertvalue %struct.CGSize undef, double %tmp, 0 + %insert9 = insertvalue %struct.CGSize %insert7, double %tmp5, 1 + %insert3 = insertvalue %struct.CGRect %insert, %struct.CGSize %insert9, 1 + ret %struct.CGRect %insert3 +} + +!llvm.module.flags = !{!0, !1, !2, !3} + +!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2} +!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0} +!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"} +!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0} +!4 = metadata !{} diff --git a/test/CodeGen/ARM64/2012-06-06-FPToUI.ll b/test/CodeGen/ARM64/2012-06-06-FPToUI.ll new file mode 100644 index 0000000000..dda4ff5bad --- /dev/null +++ b/test/CodeGen/ARM64/2012-06-06-FPToUI.ll @@ -0,0 +1,65 @@ +; RUN: llc -march=arm64 -O0 < %s | FileCheck %s +; RUN: llc -march=arm64 -O3 < %s | FileCheck %s + +@.str = private unnamed_addr constant [9 x i8] c"%lf %lu\0A\00", align 1 +@.str1 = private unnamed_addr constant [8 x i8] c"%lf %u\0A\00", align 1 +@.str2 = private unnamed_addr constant [8 x i8] c"%f %lu\0A\00", align 1 +@.str3 = private unnamed_addr constant [7 x i8] c"%f %u\0A\00", align 1 + +define void @testDouble(double %d) ssp { +; CHECK: fcvtzu x{{.}}, d{{.}} +; CHECK: fcvtzu w{{.}}, d{{.}} +entry: + %d.addr = alloca double, align 8 + store double %d, double* %d.addr, align 8 + %0 = load double* %d.addr, align 8 + %1 = load double* %d.addr, align 8 + %conv = fptoui double %1 to i64 + 
%call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([9 x i8]* @.str, i32 0, i32 0), double %0, i64 %conv) + %2 = load double* %d.addr, align 8 + %3 = load double* %d.addr, align 8 + %conv1 = fptoui double %3 to i32 + %call2 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str1, i32 0, i32 0), double %2, i32 %conv1) + ret void +} + +declare i32 @printf(i8*, ...) + +define void @testFloat(float %f) ssp { +; CHECK: fcvtzu x{{.}}, s{{.}} +; CHECK: fcvtzu w{{.}}, s{{.}} +entry: + %f.addr = alloca float, align 4 + store float %f, float* %f.addr, align 4 + %0 = load float* %f.addr, align 4 + %conv = fpext float %0 to double + %1 = load float* %f.addr, align 4 + %conv1 = fptoui float %1 to i64 + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([8 x i8]* @.str2, i32 0, i32 0), double %conv, i64 %conv1) + %2 = load float* %f.addr, align 4 + %conv2 = fpext float %2 to double + %3 = load float* %f.addr, align 4 + %conv3 = fptoui float %3 to i32 + %call4 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([7 x i8]* @.str3, i32 0, i32 0), double %conv2, i32 %conv3) + ret void +} + +define i32 @main(i32 %argc, i8** %argv) ssp { +entry: + %retval = alloca i32, align 4 + %argc.addr = alloca i32, align 4 + %argv.addr = alloca i8**, align 8 + store i32 0, i32* %retval + store i32 %argc, i32* %argc.addr, align 4 + store i8** %argv, i8*** %argv.addr, align 8 + call void @testDouble(double 1.159198e+01) + call void @testFloat(float 0x40272F1800000000) + ret i32 0 +} + +!llvm.module.flags = !{!0, !1, !2, !3} + +!0 = metadata !{i32 1, metadata !"Objective-C Version", i32 2} +!1 = metadata !{i32 1, metadata !"Objective-C Image Info Version", i32 0} +!2 = metadata !{i32 1, metadata !"Objective-C Image Info Section", metadata !"__DATA, __objc_imageinfo, regular, no_dead_strip"} +!3 = metadata !{i32 4, metadata !"Objective-C Garbage Collection", i32 0} diff --git a/test/CodeGen/ARM64/2012-07-11-InstrEmitterBug.ll b/test/CodeGen/ARM64/2012-07-11-InstrEmitterBug.ll new file mode 100644 index 0000000000..55ecfb5d2b --- /dev/null +++ b/test/CodeGen/ARM64/2012-07-11-InstrEmitterBug.ll @@ -0,0 +1,56 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios +; rdar://11849816 + +@shlib_path_substitutions = external hidden unnamed_addr global i8**, align 8 + +declare i64 @llvm.objectsize.i64(i8*, i1) nounwind readnone + +declare noalias i8* @xmalloc(i64) optsize + +declare i64 @strlen(i8* nocapture) nounwind readonly optsize + +declare i8* @__strcpy_chk(i8*, i8*, i64) nounwind optsize + +declare i8* @__strcat_chk(i8*, i8*, i64) nounwind optsize + +declare noalias i8* @xstrdup(i8*) optsize + +define i8* @dyld_fix_path(i8* %path) nounwind optsize ssp { +entry: + br i1 undef, label %if.end56, label %for.cond + +for.cond: ; preds = %entry + br i1 undef, label %for.cond10, label %for.body + +for.body: ; preds = %for.cond + unreachable + +for.cond10: ; preds = %for.cond + br i1 undef, label %if.end56, label %for.body14 + +for.body14: ; preds = %for.cond10 + %call22 = tail call i64 @strlen(i8* undef) nounwind optsize + %sext = shl i64 %call22, 32 + %conv30 = ashr exact i64 %sext, 32 + %add29 = sub i64 0, %conv30 + %sub = add i64 %add29, 0 + %add31 = shl i64 %sub, 32 + %sext59 = add i64 %add31, 4294967296 + %conv33 = ashr exact i64 %sext59, 32 + %call34 = tail call noalias i8* @xmalloc(i64 %conv33) nounwind optsize + br i1 undef, label %cond.false45, label %cond.true43 + +cond.true43: ; preds = %for.body14 + unreachable + +cond.false45: ; preds = %for.body14 + %add.ptr = getelementptr 
inbounds i8* %path, i64 %conv30 + unreachable + +if.end56: ; preds = %for.cond10, %entry + ret i8* null +} + +declare i32 @strncmp(i8* nocapture, i8* nocapture, i64) nounwind readonly optsize + +declare i8* @strcpy(i8*, i8* nocapture) nounwind diff --git a/test/CodeGen/ARM64/2013-01-13-ffast-fcmp.ll b/test/CodeGen/ARM64/2013-01-13-ffast-fcmp.ll new file mode 100644 index 0000000000..b40a581d61 --- /dev/null +++ b/test/CodeGen/ARM64/2013-01-13-ffast-fcmp.ll @@ -0,0 +1,19 @@ +; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s +; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -fp-contract=fast | FileCheck %s --check-prefix=FAST + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128" +target triple = "arm64-apple-ios7.0.0" + +;FAST-LABEL: _Z9example25v: +;FAST: fcmgt.4s +;FAST: ret + +;CHECK-LABEL: _Z9example25v: +;CHECK: fcmgt.4s +;CHECK: ret + +define <4 x i32> @_Z9example25v( <4 x float> %N0, <4 x float> %N1) { + %A = fcmp olt <4 x float> %N0, %N1 + %B = zext <4 x i1> %A to <4 x i32> + ret <4 x i32> %B +} diff --git a/test/CodeGen/ARM64/2013-01-23-frem-crash.ll b/test/CodeGen/ARM64/2013-01-23-frem-crash.ll new file mode 100644 index 0000000000..94511243a4 --- /dev/null +++ b/test/CodeGen/ARM64/2013-01-23-frem-crash.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s -march=arm64 +; Make sure we are not crashing on this test. + +define void @autogen_SD13158() { +entry: + %B26 = frem float 0.000000e+00, undef + br i1 undef, label %CF, label %CF77 + +CF: ; preds = %CF, %CF76 + store float %B26, float* undef + br i1 undef, label %CF, label %CF77 + +CF77: ; preds = %CF + ret void +} diff --git a/test/CodeGen/ARM64/2013-01-23-sext-crash.ll b/test/CodeGen/ARM64/2013-01-23-sext-crash.ll new file mode 100644 index 0000000000..404027bfd5 --- /dev/null +++ b/test/CodeGen/ARM64/2013-01-23-sext-crash.ll @@ -0,0 +1,37 @@ +; RUN: llc < %s -march=arm64 + +; Make sure we are not crashing on this test. 
+ +define void @autogen_SD12881() { +BB: + %B17 = ashr <4 x i32> zeroinitializer, zeroinitializer + br label %CF + +CF: ; preds = %CF83, %CF, %BB + br i1 undef, label %CF, label %CF83 + +CF83: ; preds = %CF + %FC70 = sitofp <4 x i32> %B17 to <4 x double> + br label %CF +} + + +define void @autogen_SD12881_2() { +BB: + %B17 = ashr <4 x i32> zeroinitializer, zeroinitializer + br label %CF + +CF: ; preds = %CF83, %CF, %BB + br i1 undef, label %CF, label %CF83 + +CF83: ; preds = %CF + %FC70 = uitofp <4 x i32> %B17 to <4 x double> + br label %CF +} + +define void @_Z12my_example2bv() nounwind noinline ssp { +entry: + %0 = fptosi <2 x double> undef to <2 x i32> + store <2 x i32> %0, <2 x i32>* undef, align 8 + ret void +} diff --git a/test/CodeGen/ARM64/2013-02-12-shufv8i8.ll b/test/CodeGen/ARM64/2013-02-12-shufv8i8.ll new file mode 100644 index 0000000000..70e745fc57 --- /dev/null +++ b/test/CodeGen/ARM64/2013-02-12-shufv8i8.ll @@ -0,0 +1,11 @@ +; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple + +;CHECK-LABEL: Shuff: +;CHECK: tbl.8b +;CHECK: ret +define <8 x i8 > @Shuff(<8 x i8> %in, <8 x i8>* %out) nounwind ssp { + %value = shufflevector <8 x i8> %in, <8 x i8> zeroinitializer, <8 x i32> + ret <8 x i8> %value +} + + diff --git a/test/CodeGen/ARM64/AdvSIMD-Scalar.ll b/test/CodeGen/ARM64/AdvSIMD-Scalar.ll new file mode 100644 index 0000000000..6397ac54d3 --- /dev/null +++ b/test/CodeGen/ARM64/AdvSIMD-Scalar.ll @@ -0,0 +1,38 @@ +; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -arm64-simd-scalar=true -asm-verbose=false | FileCheck %s +; +define <2 x i64> @bar(<2 x i64> %a, <2 x i64> %b) nounwind readnone { +; CHECK-LABEL: bar: +; CHECK: add.2d v[[REG:[0-9]+]], v0, v1 +; CHECK: add d[[REG3:[0-9]+]], d[[REG]], d1 +; CHECK: sub d[[REG2:[0-9]+]], d[[REG]], d1 + %add = add <2 x i64> %a, %b + %vgetq_lane = extractelement <2 x i64> %add, i32 0 + %vgetq_lane2 = extractelement <2 x i64> %b, i32 0 + %add3 = add i64 %vgetq_lane, %vgetq_lane2 + %sub = sub i64 %vgetq_lane, %vgetq_lane2 + %vecinit = insertelement <2 x i64> undef, i64 %add3, i32 0 + %vecinit8 = insertelement <2 x i64> %vecinit, i64 %sub, i32 1 + ret <2 x i64> %vecinit8 +} + +define double @subdd_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone { +; CHECK-LABEL: subdd_su64: +; CHECK: sub d0, d1, d0 +; CHECK-NEXT: ret + %vecext = extractelement <2 x i64> %a, i32 0 + %vecext1 = extractelement <2 x i64> %b, i32 0 + %sub.i = sub nsw i64 %vecext1, %vecext + %retval = bitcast i64 %sub.i to double + ret double %retval +} + +define double @vaddd_su64(<2 x i64> %a, <2 x i64> %b) nounwind readnone { +; CHECK-LABEL: vaddd_su64: +; CHECK: add d0, d1, d0 +; CHECK-NEXT: ret + %vecext = extractelement <2 x i64> %a, i32 0 + %vecext1 = extractelement <2 x i64> %b, i32 0 + %add.i = add nsw i64 %vecext1, %vecext + %retval = bitcast i64 %add.i to double + ret double %retval +} diff --git a/test/CodeGen/ARM64/aapcs.ll b/test/CodeGen/ARM64/aapcs.ll new file mode 100644 index 0000000000..27d2aa7b77 --- /dev/null +++ b/test/CodeGen/ARM64/aapcs.ll @@ -0,0 +1,86 @@ +; RUN: llc -mtriple=arm64-linux-gnu -enable-misched=false < %s | FileCheck %s + +@var = global i32 0, align 4 + +define i128 @test_i128_align(i32, i128 %arg, i32 %after) { + store i32 %after, i32* @var, align 4 +; CHECK: str w4, [{{x[0-9]+}}, :lo12:var] + + ret i128 %arg +; CHECK: mov x0, x2 +; CHECK: mov x1, x3 +} + +@var64 = global i64 0, align 8 + + ; Check stack slots are 64-bit at all times. 
+define void @test_stack_slots([8 x i32], i1 %bool, i8 %char, i16 %short, + i32 %int, i64 %long) { + ; Part of last store. Blasted scheduler. +; CHECK: ldr [[LONG:x[0-9]+]], [sp, #32] + + %ext_bool = zext i1 %bool to i64 + store volatile i64 %ext_bool, i64* @var64, align 8 +; CHECK: ldr w[[EXT:[0-9]+]], [sp] +; CHECK: and x[[EXTED:[0-9]+]], x[[EXT]], #0x1 +; CHECK: str x[[EXTED]], [{{x[0-9]+}}, :lo12:var64] + + %ext_char = zext i8 %char to i64 + store volatile i64 %ext_char, i64* @var64, align 8 +; CHECK: ldrb w[[EXT:[0-9]+]], [sp, #8] +; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64] + + %ext_short = zext i16 %short to i64 + store volatile i64 %ext_short, i64* @var64, align 8 +; CHECK: ldrh w[[EXT:[0-9]+]], [sp, #16] +; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64] + + %ext_int = zext i32 %int to i64 + store volatile i64 %ext_int, i64* @var64, align 8 +; CHECK: ldr w[[EXT:[0-9]+]], [sp, #24] +; CHECK: str x[[EXT]], [{{x[0-9]+}}, :lo12:var64] + + store volatile i64 %long, i64* @var64, align 8 +; CHECK: str [[LONG]], [{{x[0-9]+}}, :lo12:var64] + + ret void +} + +; Make sure the callee does extensions (in the absence of zext/sext +; keyword on args) while we're here. + +define void @test_extension(i1 %bool, i8 %char, i16 %short, i32 %int) { + %ext_bool = zext i1 %bool to i64 + store volatile i64 %ext_bool, i64* @var64 +; CHECK: and [[EXT:x[0-9]+]], x0, #0x1 +; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64] + + %ext_char = sext i8 %char to i64 + store volatile i64 %ext_char, i64* @var64 +; CHECK: sxtb [[EXT:x[0-9]+]], x1 +; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64] + + %ext_short = zext i16 %short to i64 + store volatile i64 %ext_short, i64* @var64 +; CHECK: and [[EXT:x[0-9]+]], x2, #0xffff +; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64] + + %ext_int = zext i32 %int to i64 + store volatile i64 %ext_int, i64* @var64 +; CHECK: uxtw [[EXT:x[0-9]+]], x3 +; CHECK: str [[EXT]], [{{x[0-9]+}}, :lo12:var64] + + ret void +} + +declare void @variadic(i32 %a, ...) + + ; Under AAPCS variadic functions have the same calling convention as + ; others. The extra arguments should go in registers rather than on the stack. +define void @test_variadic() { + call void(i32, ...)* @variadic(i32 0, i64 1, double 2.0) +; CHECK: fmov d0, #2.0 +; CHECK: orr x1, xzr, #0x1 +; CHECK: bl variadic + ret void +} diff --git a/test/CodeGen/ARM64/abi-varargs.ll b/test/CodeGen/ARM64/abi-varargs.ll new file mode 100644 index 0000000000..92db392cd0 --- /dev/null +++ b/test/CodeGen/ARM64/abi-varargs.ll @@ -0,0 +1,191 @@ +; RUN: llc < %s -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s +target triple = "arm64-apple-ios7.0.0" + +; rdar://13625505 +; Here we have 9 fixed integer arguments the 9th argument in on stack, the +; varargs start right after at 8-byte alignment. +define void @fn9(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9, ...) 
nounwind noinline ssp { +; CHECK-LABEL: fn9: +; 9th fixed argument +; CHECK: ldr {{w[0-9]+}}, [sp, #64] +; CHECK: add [[ARGS:x[0-9]+]], sp, #72 +; CHECK: add {{x[0-9]+}}, [[ARGS]], #8 +; First vararg +; CHECK: ldr {{w[0-9]+}}, [sp, #72] +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #8 +; Second vararg +; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}] +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #8 +; Third vararg +; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}] + %1 = alloca i32, align 4 + %2 = alloca i32, align 4 + %3 = alloca i32, align 4 + %4 = alloca i32, align 4 + %5 = alloca i32, align 4 + %6 = alloca i32, align 4 + %7 = alloca i32, align 4 + %8 = alloca i32, align 4 + %9 = alloca i32, align 4 + %args = alloca i8*, align 8 + %a10 = alloca i32, align 4 + %a11 = alloca i32, align 4 + %a12 = alloca i32, align 4 + store i32 %a1, i32* %1, align 4 + store i32 %a2, i32* %2, align 4 + store i32 %a3, i32* %3, align 4 + store i32 %a4, i32* %4, align 4 + store i32 %a5, i32* %5, align 4 + store i32 %a6, i32* %6, align 4 + store i32 %a7, i32* %7, align 4 + store i32 %a8, i32* %8, align 4 + store i32 %a9, i32* %9, align 4 + %10 = bitcast i8** %args to i8* + call void @llvm.va_start(i8* %10) + %11 = va_arg i8** %args, i32 + store i32 %11, i32* %a10, align 4 + %12 = va_arg i8** %args, i32 + store i32 %12, i32* %a11, align 4 + %13 = va_arg i8** %args, i32 + store i32 %13, i32* %a12, align 4 + ret void +} + +declare void @llvm.va_start(i8*) nounwind + +define i32 @main() nounwind ssp { +; CHECK-LABEL: main: +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] +; CHECK: str {{x[0-9]+}}, [sp, #8] +; CHECK: str {{w[0-9]+}}, [sp] + %a1 = alloca i32, align 4 + %a2 = alloca i32, align 4 + %a3 = alloca i32, align 4 + %a4 = alloca i32, align 4 + %a5 = alloca i32, align 4 + %a6 = alloca i32, align 4 + %a7 = alloca i32, align 4 + %a8 = alloca i32, align 4 + %a9 = alloca i32, align 4 + %a10 = alloca i32, align 4 + %a11 = alloca i32, align 4 + %a12 = alloca i32, align 4 + store i32 1, i32* %a1, align 4 + store i32 2, i32* %a2, align 4 + store i32 3, i32* %a3, align 4 + store i32 4, i32* %a4, align 4 + store i32 5, i32* %a5, align 4 + store i32 6, i32* %a6, align 4 + store i32 7, i32* %a7, align 4 + store i32 8, i32* %a8, align 4 + store i32 9, i32* %a9, align 4 + store i32 10, i32* %a10, align 4 + store i32 11, i32* %a11, align 4 + store i32 12, i32* %a12, align 4 + %1 = load i32* %a1, align 4 + %2 = load i32* %a2, align 4 + %3 = load i32* %a3, align 4 + %4 = load i32* %a4, align 4 + %5 = load i32* %a5, align 4 + %6 = load i32* %a6, align 4 + %7 = load i32* %a7, align 4 + %8 = load i32* %a8, align 4 + %9 = load i32* %a9, align 4 + %10 = load i32* %a10, align 4 + %11 = load i32* %a11, align 4 + %12 = load i32* %a12, align 4 + call void (i32, i32, i32, i32, i32, i32, i32, i32, i32, ...)* @fn9(i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12) + ret i32 0 +} + +;rdar://13668483 +@.str = private unnamed_addr constant [4 x i8] c"fmt\00", align 1 +define void @foo(i8* %fmt, ...) 
nounwind { +entry: +; CHECK-LABEL: foo: +; CHECK: orr {{x[0-9]+}}, {{x[0-9]+}}, #0x8 +; CHECK: ldr {{w[0-9]+}}, [sp, #48] +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #15 +; CHECK: and x[[ADDR:[0-9]+]], {{x[0-9]+}}, #0xfffffffffffffff0 +; CHECK: ldr {{q[0-9]+}}, [x[[ADDR]]] + %fmt.addr = alloca i8*, align 8 + %args = alloca i8*, align 8 + %vc = alloca i32, align 4 + %vv = alloca <4 x i32>, align 16 + store i8* %fmt, i8** %fmt.addr, align 8 + %args1 = bitcast i8** %args to i8* + call void @llvm.va_start(i8* %args1) + %0 = va_arg i8** %args, i32 + store i32 %0, i32* %vc, align 4 + %1 = va_arg i8** %args, <4 x i32> + store <4 x i32> %1, <4 x i32>* %vv, align 16 + ret void +} + +define void @bar(i32 %x, <4 x i32> %y) nounwind { +entry: +; CHECK-LABEL: bar: +; CHECK: str {{q[0-9]+}}, [sp, #16] +; CHECK: str {{x[0-9]+}}, [sp] + %x.addr = alloca i32, align 4 + %y.addr = alloca <4 x i32>, align 16 + store i32 %x, i32* %x.addr, align 4 + store <4 x i32> %y, <4 x i32>* %y.addr, align 16 + %0 = load i32* %x.addr, align 4 + %1 = load <4 x i32>* %y.addr, align 16 + call void (i8*, ...)* @foo(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 %0, <4 x i32> %1) + ret void +} + +; rdar://13668927 +; When passing 16-byte aligned small structs as vararg, make sure the caller +; side is 16-byte aligned on stack. +%struct.s41 = type { i32, i16, i32, i16 } +define void @foo2(i8* %fmt, ...) nounwind { +entry: +; CHECK-LABEL: foo2: +; CHECK: orr {{x[0-9]+}}, {{x[0-9]+}}, #0x8 +; CHECK: ldr {{w[0-9]+}}, [sp, #48] +; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #15 +; CHECK: and x[[ADDR:[0-9]+]], {{x[0-9]+}}, #0xfffffffffffffff0 +; CHECK: ldr {{q[0-9]+}}, [x[[ADDR]]] + %fmt.addr = alloca i8*, align 8 + %args = alloca i8*, align 8 + %vc = alloca i32, align 4 + %vs = alloca %struct.s41, align 16 + store i8* %fmt, i8** %fmt.addr, align 8 + %args1 = bitcast i8** %args to i8* + call void @llvm.va_start(i8* %args1) + %0 = va_arg i8** %args, i32 + store i32 %0, i32* %vc, align 4 + %ap.cur = load i8** %args + %1 = getelementptr i8* %ap.cur, i32 15 + %2 = ptrtoint i8* %1 to i64 + %3 = and i64 %2, -16 + %ap.align = inttoptr i64 %3 to i8* + %ap.next = getelementptr i8* %ap.align, i32 16 + store i8* %ap.next, i8** %args + %4 = bitcast i8* %ap.align to %struct.s41* + %5 = bitcast %struct.s41* %vs to i8* + %6 = bitcast %struct.s41* %4 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* %6, i64 16, i32 16, i1 false) + ret void +} +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind + +define void @bar2(i32 %x, i128 %s41.coerce) nounwind { +entry: +; CHECK-LABEL: bar2: +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] +; CHECK: str {{x[0-9]+}}, [sp] + %x.addr = alloca i32, align 4 + %s41 = alloca %struct.s41, align 16 + store i32 %x, i32* %x.addr, align 4 + %0 = bitcast %struct.s41* %s41 to i128* + store i128 %s41.coerce, i128* %0, align 1 + %1 = load i32* %x.addr, align 4 + %2 = bitcast %struct.s41* %s41 to i128* + %3 = load i128* %2, align 1 + call void (i8*, ...)* @foo2(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i32 %1, i128 %3) + ret void +} diff --git a/test/CodeGen/ARM64/abi.ll b/test/CodeGen/ARM64/abi.ll new file mode 100644 index 0000000000..a7693b6ba9 --- /dev/null +++ b/test/CodeGen/ARM64/abi.ll @@ -0,0 +1,236 @@ +; RUN: llc < %s -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s +; RUN: llc < %s -O0 | FileCheck -check-prefix=FAST %s +target triple = "arm64-apple-darwin" + +; rdar://9932559 +define i64 @i8i16callee(i64 %a1, i64 %a2, i64 %a3, i8 
signext %a4, i16 signext %a5, i64 %a6, i64 %a7, i64 %a8, i8 signext %b1, i16 signext %b2, i8 signext %b3, i8 signext %b4) nounwind readnone noinline {
+entry:
+; CHECK-LABEL: i8i16callee:
+; The 8th, 9th, 10th and 11th arguments are passed at sp, sp+2, sp+4, sp+5.
+; They are i8, i16, i8 and i8.
+; CHECK: ldrsb {{w[0-9]+}}, [sp, #5]
+; CHECK: ldrsh {{w[0-9]+}}, [sp, #2]
+; CHECK: ldrsb {{w[0-9]+}}, [sp]
+; CHECK: ldrsb {{w[0-9]+}}, [sp, #4]
+; FAST-LABEL: i8i16callee:
+; FAST: ldrb {{w[0-9]+}}, [sp, #5]
+; FAST: ldrb {{w[0-9]+}}, [sp, #4]
+; FAST: ldrh {{w[0-9]+}}, [sp, #2]
+; FAST: ldrb {{w[0-9]+}}, [sp]
+  %conv = sext i8 %a4 to i64
+  %conv3 = sext i16 %a5 to i64
+  %conv8 = sext i8 %b1 to i64
+  %conv9 = sext i16 %b2 to i64
+  %conv11 = sext i8 %b3 to i64
+  %conv13 = sext i8 %b4 to i64
+  %add10 = add i64 %a2, %a1
+  %add12 = add i64 %add10, %a3
+  %add14 = add i64 %add12, %conv
+  %add = add i64 %add14, %conv3
+  %add1 = add i64 %add, %a6
+  %add2 = add i64 %add1, %a7
+  %add4 = add i64 %add2, %a8
+  %add5 = add i64 %add4, %conv8
+  %add6 = add i64 %add5, %conv9
+  %add7 = add i64 %add6, %conv11
+  %add15 = add i64 %add7, %conv13
+  %sext = shl i64 %add15, 32
+  %conv17 = ashr exact i64 %sext, 32
+  ret i64 %conv17
+}
+
+define i32 @i8i16caller() nounwind readnone {
+entry:
+; CHECK: i8i16caller
+; The 8th, 9th, 10th and 11th arguments are passed at sp, sp+2, sp+4, sp+5.
+; They are i8, i16, i8 and i8.
+; CHECK: strb {{w[0-9]+}}, [sp, #5]
+; CHECK: strb {{w[0-9]+}}, [sp, #4]
+; CHECK: strh {{w[0-9]+}}, [sp, #2]
+; CHECK: strb {{w[0-9]+}}, [sp]
+; CHECK: bl
+; FAST: i8i16caller
+; FAST: strb {{w[0-9]+}}, [sp]
+; FAST: strh {{w[0-9]+}}, [sp, #2]
+; FAST: strb {{w[0-9]+}}, [sp, #4]
+; FAST: strb {{w[0-9]+}}, [sp, #5]
+; FAST: bl
+  %call = tail call i64 @i8i16callee(i64 0, i64 1, i64 2, i8 signext 3, i16 signext 4, i64 5, i64 6, i64 7, i8 signext 97, i16 signext 98, i8 signext 99, i8 signext 100)
+  %conv = trunc i64 %call to i32
+  ret i32 %conv
+}
+
+; rdar://12651543
+define double @circle_center([2 x float] %a) nounwind ssp {
+  %call = tail call double @ext([2 x float] %a) nounwind
+; CHECK: circle_center
+; CHECK: bl
+  ret double %call
+}
+declare double @ext([2 x float])
+
+; rdar://12656141
+; A 16-byte vector should be 16-byte aligned when passed on the stack.
+; A double argument will be passed on the stack, so the vector should be at sp+16.
+define double @fixed_4i(<4 x i32>* nocapture %in) nounwind {
+entry:
+; CHECK: fixed_4i
+; CHECK: str [[REG_1:q[0-9]+]], [sp, #16]
+; FAST: fixed_4i
+; FAST: mov x[[ADDR:[0-9]+]], sp
+; FAST: str [[REG_1:q[0-9]+]], [x[[ADDR]], #16]
+  %0 = load <4 x i32>* %in, align 16
+  %call = tail call double @args_vec_4i(double 3.000000e+00, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, <4 x i32> %0, double 3.000000e+00, <4 x i32> %0, i8 signext 3)
+  ret double %call
+}
+declare double @args_vec_4i(double, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, double, <4 x i32>, i8 signext)
+
+; rdar://12695237
+; d8 at sp, i in register w0.
+@g_d = common global double 0.000000e+00, align 8 +define void @test1(float %f1, double %d1, double %d2, double %d3, double %d4, + double %d5, double %d6, double %d7, double %d8, i32 %i) nounwind ssp { +entry: +; CHECK: test1 +; CHECK: ldr [[REG_1:d[0-9]+]], [sp] +; CHECK: scvtf [[REG_2:s[0-9]+]], w0 +; CHECK: fadd s0, [[REG_2]], s0 + %conv = sitofp i32 %i to float + %add = fadd float %conv, %f1 + %conv1 = fpext float %add to double + %add2 = fadd double %conv1, %d7 + %add3 = fadd double %add2, %d8 + store double %add3, double* @g_d, align 8 + ret void +} + +; i9 at sp, d1 in register s0. +define void @test2(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, + i32 %i7, i32 %i8, i32 %i9, float %d1) nounwind ssp { +entry: +; CHECK: test2 +; CHECK: scvtf [[REG_2:s[0-9]+]], w0 +; CHECK: fadd s0, [[REG_2]], s0 +; CHECK: ldr [[REG_1:s[0-9]+]], [sp] + %conv = sitofp i32 %i1 to float + %add = fadd float %conv, %d1 + %conv1 = fpext float %add to double + %conv2 = sitofp i32 %i8 to double + %add3 = fadd double %conv2, %conv1 + %conv4 = sitofp i32 %i9 to double + %add5 = fadd double %conv4, %add3 + store double %add5, double* @g_d, align 8 + ret void +} + +; rdar://12648441 +; Check alignment on stack for v64, f64, i64, f32, i32. +define double @test3(<2 x i32>* nocapture %in) nounwind { +entry: +; CHECK: test3 +; CHECK: str [[REG_1:d[0-9]+]], [sp, #8] +; FAST: test3 +; FAST: mov x[[ADDR:[0-9]+]], sp +; FAST: str [[REG_1:d[0-9]+]], [x[[ADDR]], #8] + %0 = load <2 x i32>* %in, align 8 + %call = tail call double @args_vec_2i(double 3.000000e+00, <2 x i32> %0, + <2 x i32> %0, <2 x i32> %0, <2 x i32> %0, <2 x i32> %0, <2 x i32> %0, + <2 x i32> %0, float 3.000000e+00, <2 x i32> %0, i8 signext 3) + ret double %call +} +declare double @args_vec_2i(double, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, + <2 x i32>, <2 x i32>, <2 x i32>, float, <2 x i32>, i8 signext) + +define double @test4(double* nocapture %in) nounwind { +entry: +; CHECK: test4 +; CHECK: str [[REG_1:d[0-9]+]], [sp, #8] +; CHECK: str [[REG_2:w[0-9]+]], [sp] +; CHECK: orr w0, wzr, #0x3 + %0 = load double* %in, align 8 + %call = tail call double @args_f64(double 3.000000e+00, double %0, double %0, + double %0, double %0, double %0, double %0, double %0, + float 3.000000e+00, double %0, i8 signext 3) + ret double %call +} +declare double @args_f64(double, double, double, double, double, double, double, + double, float, double, i8 signext) + +define i64 @test5(i64* nocapture %in) nounwind { +entry: +; CHECK: test5 +; CHECK: strb [[REG_3:w[0-9]+]], [sp, #16] +; CHECK: str [[REG_1:x[0-9]+]], [sp, #8] +; CHECK: str [[REG_2:w[0-9]+]], [sp] + %0 = load i64* %in, align 8 + %call = tail call i64 @args_i64(i64 3, i64 %0, i64 %0, i64 %0, i64 %0, i64 %0, + i64 %0, i64 %0, i32 3, i64 %0, i8 signext 3) + ret i64 %call +} +declare i64 @args_i64(i64, i64, i64, i64, i64, i64, i64, i64, i32, i64, + i8 signext) + +define i32 @test6(float* nocapture %in) nounwind { +entry: +; CHECK: test6 +; CHECK: strb [[REG_2:w[0-9]+]], [sp, #8] +; CHECK: str [[REG_1:s[0-9]+]], [sp, #4] +; CHECK: strh [[REG_3:w[0-9]+]], [sp] + %0 = load float* %in, align 4 + %call = tail call i32 @args_f32(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, + i32 7, i32 8, float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, + float 6.0, float 7.0, float 8.0, i16 signext 3, float %0, + i8 signext 3) + ret i32 %call +} +declare i32 @args_f32(i32, i32, i32, i32, i32, i32, i32, i32, + float, float, float, float, float, float, float, float, + i16 signext, float, i8 signext) + +define i32 @test7(i32* 
nocapture %in) nounwind { +entry: +; CHECK: test7 +; CHECK: strb [[REG_2:w[0-9]+]], [sp, #8] +; CHECK: str [[REG_1:w[0-9]+]], [sp, #4] +; CHECK: strh [[REG_3:w[0-9]+]], [sp] + %0 = load i32* %in, align 4 + %call = tail call i32 @args_i32(i32 3, i32 %0, i32 %0, i32 %0, i32 %0, i32 %0, + i32 %0, i32 %0, i16 signext 3, i32 %0, i8 signext 4) + ret i32 %call +} +declare i32 @args_i32(i32, i32, i32, i32, i32, i32, i32, i32, i16 signext, i32, + i8 signext) + +define i32 @test8(i32 %argc, i8** nocapture %argv) nounwind { +entry: +; CHECK: test8 +; CHECK: strb {{w[0-9]+}}, [sp, #3] +; CHECK: strb wzr, [sp, #2] +; CHECK: strb {{w[0-9]+}}, [sp, #1] +; CHECK: strb wzr, [sp] +; CHECK: bl +; FAST: test8 +; FAST: strb {{w[0-9]+}}, [sp] +; FAST: strb {{w[0-9]+}}, [sp, #1] +; FAST: strb {{w[0-9]+}}, [sp, #2] +; FAST: strb {{w[0-9]+}}, [sp, #3] +; FAST: bl + tail call void @args_i1(i1 zeroext false, i1 zeroext true, i1 zeroext false, + i1 zeroext true, i1 zeroext false, i1 zeroext true, + i1 zeroext false, i1 zeroext true, i1 zeroext false, + i1 zeroext true, i1 zeroext false, i1 zeroext true) + ret i32 0 +} + +declare void @args_i1(i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext, + i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext, + i1 zeroext, i1 zeroext, i1 zeroext, i1 zeroext) + +define i32 @i1_stack_incoming(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, + i64 %g, i64 %h, i64 %i, i1 zeroext %j) { +; CHECK-LABEL: i1_stack_incoming: +; CHECK: ldrb w0, [sp, #8] +; CHECK: ret + %v = zext i1 %j to i32 + ret i32 %v +} diff --git a/test/CodeGen/ARM64/abi_align.ll b/test/CodeGen/ARM64/abi_align.ll new file mode 100644 index 0000000000..61c661e48f --- /dev/null +++ b/test/CodeGen/ARM64/abi_align.ll @@ -0,0 +1,529 @@ +; RUN: llc < %s -march=arm64 -mcpu=cyclone -enable-misched=false | FileCheck %s +; RUN: llc < %s -O0 | FileCheck -check-prefix=FAST %s +target triple = "arm64-apple-darwin" + +; rdar://12648441 +; Generated from arm64-arguments.c with -O2. +; Test passing structs with size < 8, < 16 and > 16 +; with alignment of 16 and without + +; Structs with size < 8 +%struct.s38 = type { i32, i16 } +; With alignment of 16, the size will be padded to multiple of 16 bytes. 
+%struct.s39 = type { i32, i16, [10 x i8] } +; Structs with size < 16 +%struct.s40 = type { i32, i16, i32, i16 } +%struct.s41 = type { i32, i16, i32, i16 } +; Structs with size > 16 +%struct.s42 = type { i32, i16, i32, i16, i32, i16 } +%struct.s43 = type { i32, i16, i32, i16, i32, i16, [10 x i8] } + +@g38 = common global %struct.s38 zeroinitializer, align 4 +@g38_2 = common global %struct.s38 zeroinitializer, align 4 +@g39 = common global %struct.s39 zeroinitializer, align 16 +@g39_2 = common global %struct.s39 zeroinitializer, align 16 +@g40 = common global %struct.s40 zeroinitializer, align 4 +@g40_2 = common global %struct.s40 zeroinitializer, align 4 +@g41 = common global %struct.s41 zeroinitializer, align 16 +@g41_2 = common global %struct.s41 zeroinitializer, align 16 +@g42 = common global %struct.s42 zeroinitializer, align 4 +@g42_2 = common global %struct.s42 zeroinitializer, align 4 +@g43 = common global %struct.s43 zeroinitializer, align 16 +@g43_2 = common global %struct.s43 zeroinitializer, align 16 + +; structs with size < 8 bytes, passed via i64 in x1 and x2 +define i32 @f38(i32 %i, i64 %s1.coerce, i64 %s2.coerce) #0 { +entry: +; CHECK: f38 +; CHECK: add w[[A:[0-9]+]], w1, w0 +; CHECK: add {{w[0-9]+}}, w[[A]], w2 + %s1.sroa.0.0.extract.trunc = trunc i64 %s1.coerce to i32 + %s1.sroa.1.4.extract.shift = lshr i64 %s1.coerce, 32 + %s2.sroa.0.0.extract.trunc = trunc i64 %s2.coerce to i32 + %s2.sroa.1.4.extract.shift = lshr i64 %s2.coerce, 32 + %sext8 = shl nuw nsw i64 %s1.sroa.1.4.extract.shift, 16 + %sext = trunc i64 %sext8 to i32 + %conv = ashr exact i32 %sext, 16 + %sext1011 = shl nuw nsw i64 %s2.sroa.1.4.extract.shift, 16 + %sext10 = trunc i64 %sext1011 to i32 + %conv6 = ashr exact i32 %sext10, 16 + %add = add i32 %s1.sroa.0.0.extract.trunc, %i + %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc + %add4 = add i32 %add3, %conv + %add7 = add i32 %add4, %conv6 + ret i32 %add7 +} + +define i32 @caller38() #1 { +entry: +; CHECK: caller38 +; CHECK: ldr x1, +; CHECK: ldr x2, + %0 = load i64* bitcast (%struct.s38* @g38 to i64*), align 4 + %1 = load i64* bitcast (%struct.s38* @g38_2 to i64*), align 4 + %call = tail call i32 @f38(i32 3, i64 %0, i64 %1) #5 + ret i32 %call +} + +declare i32 @f38_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, + i32 %i7, i32 %i8, i32 %i9, i64 %s1.coerce, i64 %s2.coerce) #0 + +; structs with size < 8 bytes, passed on stack at [sp+8] and [sp+16] +; i9 at [sp] +define i32 @caller38_stack() #1 { +entry: +; CHECK: caller38_stack +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8] +; CHECK: movz w[[C:[0-9]+]], #9 +; CHECK: str w[[C]], [sp] + %0 = load i64* bitcast (%struct.s38* @g38 to i64*), align 4 + %1 = load i64* bitcast (%struct.s38* @g38_2 to i64*), align 4 + %call = tail call i32 @f38_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, + i32 7, i32 8, i32 9, i64 %0, i64 %1) #5 + ret i32 %call +} + +; structs with size < 8 bytes, alignment of 16 +; passed via i128 in x1 and x3 +define i32 @f39(i32 %i, i128 %s1.coerce, i128 %s2.coerce) #0 { +entry: +; CHECK: f39 +; CHECK: add w[[A:[0-9]+]], w1, w0 +; CHECK: add {{w[0-9]+}}, w[[A]], w3 + %s1.sroa.0.0.extract.trunc = trunc i128 %s1.coerce to i32 + %s1.sroa.1.4.extract.shift = lshr i128 %s1.coerce, 32 + %s2.sroa.0.0.extract.trunc = trunc i128 %s2.coerce to i32 + %s2.sroa.1.4.extract.shift = lshr i128 %s2.coerce, 32 + %sext8 = shl nuw nsw i128 %s1.sroa.1.4.extract.shift, 16 + %sext = trunc i128 %sext8 to i32 + %conv = ashr exact i32 %sext, 16 + %sext1011 = shl nuw nsw i128 %s2.sroa.1.4.extract.shift, 16 + 
%sext10 = trunc i128 %sext1011 to i32 + %conv6 = ashr exact i32 %sext10, 16 + %add = add i32 %s1.sroa.0.0.extract.trunc, %i + %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc + %add4 = add i32 %add3, %conv + %add7 = add i32 %add4, %conv6 + ret i32 %add7 +} + +define i32 @caller39() #1 { +entry: +; CHECK: caller39 +; CHECK: ldp x1, x2, +; CHECK: ldp x3, x4, + %0 = load i128* bitcast (%struct.s39* @g39 to i128*), align 16 + %1 = load i128* bitcast (%struct.s39* @g39_2 to i128*), align 16 + %call = tail call i32 @f39(i32 3, i128 %0, i128 %1) #5 + ret i32 %call +} + +declare i32 @f39_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, + i32 %i7, i32 %i8, i32 %i9, i128 %s1.coerce, i128 %s2.coerce) #0 + +; structs with size < 8 bytes, alignment 16 +; passed on stack at [sp+16] and [sp+32] +define i32 @caller39_stack() #1 { +entry: +; CHECK: caller39_stack +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #32] +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] +; CHECK: movz w[[C:[0-9]+]], #9 +; CHECK: str w[[C]], [sp] + %0 = load i128* bitcast (%struct.s39* @g39 to i128*), align 16 + %1 = load i128* bitcast (%struct.s39* @g39_2 to i128*), align 16 + %call = tail call i32 @f39_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, + i32 7, i32 8, i32 9, i128 %0, i128 %1) #5 + ret i32 %call +} + +; structs with size < 16 bytes +; passed via i128 in x1 and x3 +define i32 @f40(i32 %i, [2 x i64] %s1.coerce, [2 x i64] %s2.coerce) #0 { +entry: +; CHECK: f40 +; CHECK: add w[[A:[0-9]+]], w1, w0 +; CHECK: add {{w[0-9]+}}, w[[A]], w3 + %s1.coerce.fca.0.extract = extractvalue [2 x i64] %s1.coerce, 0 + %s2.coerce.fca.0.extract = extractvalue [2 x i64] %s2.coerce, 0 + %s1.sroa.0.0.extract.trunc = trunc i64 %s1.coerce.fca.0.extract to i32 + %s2.sroa.0.0.extract.trunc = trunc i64 %s2.coerce.fca.0.extract to i32 + %s1.sroa.0.4.extract.shift = lshr i64 %s1.coerce.fca.0.extract, 32 + %sext8 = shl nuw nsw i64 %s1.sroa.0.4.extract.shift, 16 + %sext = trunc i64 %sext8 to i32 + %conv = ashr exact i32 %sext, 16 + %s2.sroa.0.4.extract.shift = lshr i64 %s2.coerce.fca.0.extract, 32 + %sext1011 = shl nuw nsw i64 %s2.sroa.0.4.extract.shift, 16 + %sext10 = trunc i64 %sext1011 to i32 + %conv6 = ashr exact i32 %sext10, 16 + %add = add i32 %s1.sroa.0.0.extract.trunc, %i + %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc + %add4 = add i32 %add3, %conv + %add7 = add i32 %add4, %conv6 + ret i32 %add7 +} + +define i32 @caller40() #1 { +entry: +; CHECK: caller40 +; CHECK: ldp x1, x2, +; CHECK: ldp x3, x4, + %0 = load [2 x i64]* bitcast (%struct.s40* @g40 to [2 x i64]*), align 4 + %1 = load [2 x i64]* bitcast (%struct.s40* @g40_2 to [2 x i64]*), align 4 + %call = tail call i32 @f40(i32 3, [2 x i64] %0, [2 x i64] %1) #5 + ret i32 %call +} + +declare i32 @f40_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, + i32 %i7, i32 %i8, i32 %i9, [2 x i64] %s1.coerce, [2 x i64] %s2.coerce) #0 + +; structs with size < 16 bytes +; passed on stack at [sp+8] and [sp+24] +define i32 @caller40_stack() #1 { +entry: +; CHECK: caller40_stack +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #24] +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8] +; CHECK: movz w[[C:[0-9]+]], #9 +; CHECK: str w[[C]], [sp] + %0 = load [2 x i64]* bitcast (%struct.s40* @g40 to [2 x i64]*), align 4 + %1 = load [2 x i64]* bitcast (%struct.s40* @g40_2 to [2 x i64]*), align 4 + %call = tail call i32 @f40_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, + i32 7, i32 8, i32 9, [2 x i64] %0, [2 x i64] %1) #5 + ret i32 %call +} + +; structs with size < 16 bytes, alignment of 16 +; passed 
via i128 in x1 and x3 +define i32 @f41(i32 %i, i128 %s1.coerce, i128 %s2.coerce) #0 { +entry: +; CHECK: f41 +; CHECK: add w[[A:[0-9]+]], w1, w0 +; CHECK: add {{w[0-9]+}}, w[[A]], w3 + %s1.sroa.0.0.extract.trunc = trunc i128 %s1.coerce to i32 + %s1.sroa.1.4.extract.shift = lshr i128 %s1.coerce, 32 + %s2.sroa.0.0.extract.trunc = trunc i128 %s2.coerce to i32 + %s2.sroa.1.4.extract.shift = lshr i128 %s2.coerce, 32 + %sext8 = shl nuw nsw i128 %s1.sroa.1.4.extract.shift, 16 + %sext = trunc i128 %sext8 to i32 + %conv = ashr exact i32 %sext, 16 + %sext1011 = shl nuw nsw i128 %s2.sroa.1.4.extract.shift, 16 + %sext10 = trunc i128 %sext1011 to i32 + %conv6 = ashr exact i32 %sext10, 16 + %add = add i32 %s1.sroa.0.0.extract.trunc, %i + %add3 = add i32 %add, %s2.sroa.0.0.extract.trunc + %add4 = add i32 %add3, %conv + %add7 = add i32 %add4, %conv6 + ret i32 %add7 +} + +define i32 @caller41() #1 { +entry: +; CHECK: caller41 +; CHECK: ldp x1, x2, +; CHECK: ldp x3, x4, + %0 = load i128* bitcast (%struct.s41* @g41 to i128*), align 16 + %1 = load i128* bitcast (%struct.s41* @g41_2 to i128*), align 16 + %call = tail call i32 @f41(i32 3, i128 %0, i128 %1) #5 + ret i32 %call +} + +declare i32 @f41_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, + i32 %i7, i32 %i8, i32 %i9, i128 %s1.coerce, i128 %s2.coerce) #0 + +; structs with size < 16 bytes, alignment of 16 +; passed on stack at [sp+16] and [sp+32] +define i32 @caller41_stack() #1 { +entry: +; CHECK: caller41_stack +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #32] +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #16] +; CHECK: movz w[[C:[0-9]+]], #9 +; CHECK: str w[[C]], [sp] + %0 = load i128* bitcast (%struct.s41* @g41 to i128*), align 16 + %1 = load i128* bitcast (%struct.s41* @g41_2 to i128*), align 16 + %call = tail call i32 @f41_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, + i32 7, i32 8, i32 9, i128 %0, i128 %1) #5 + ret i32 %call +} + +; structs with size of 22 bytes, passed indirectly in x1 and x2 +define i32 @f42(i32 %i, %struct.s42* nocapture %s1, %struct.s42* nocapture %s2) #2 { +entry: +; CHECK: f42 +; CHECK: ldr w[[A:[0-9]+]], [x1] +; CHECK: ldr w[[B:[0-9]+]], [x2] +; CHECK: add w[[C:[0-9]+]], w[[A]], w0 +; CHECK: add {{w[0-9]+}}, w[[C]], w[[B]] +; FAST: f42 +; FAST: ldr w[[A:[0-9]+]], [x1] +; FAST: ldr w[[B:[0-9]+]], [x2] +; FAST: add w[[C:[0-9]+]], w[[A]], w0 +; FAST: add {{w[0-9]+}}, w[[C]], w[[B]] + %i1 = getelementptr inbounds %struct.s42* %s1, i64 0, i32 0 + %0 = load i32* %i1, align 4, !tbaa !0 + %i2 = getelementptr inbounds %struct.s42* %s2, i64 0, i32 0 + %1 = load i32* %i2, align 4, !tbaa !0 + %s = getelementptr inbounds %struct.s42* %s1, i64 0, i32 1 + %2 = load i16* %s, align 2, !tbaa !3 + %conv = sext i16 %2 to i32 + %s5 = getelementptr inbounds %struct.s42* %s2, i64 0, i32 1 + %3 = load i16* %s5, align 2, !tbaa !3 + %conv6 = sext i16 %3 to i32 + %add = add i32 %0, %i + %add3 = add i32 %add, %1 + %add4 = add i32 %add3, %conv + %add7 = add i32 %add4, %conv6 + ret i32 %add7 +} + +; For s1, we allocate a 22-byte space, pass its address via x1 +define i32 @caller42() #3 { +entry: +; CHECK: caller42 +; CHECK: str {{x[0-9]+}}, [sp, #48] +; CHECK: str {{q[0-9]+}}, [sp, #32] +; CHECK: str {{x[0-9]+}}, [sp, #16] +; CHECK: str {{q[0-9]+}}, [sp] +; CHECK: add x1, sp, #32 +; CHECK: mov x2, sp +; Space for s1 is allocated at sp+32 +; Space for s2 is allocated at sp + +; FAST: caller42 +; FAST: sub sp, sp, #96 +; Space for s1 is allocated at fp-24 = sp+72 +; Space for s2 is allocated at sp+48 +; FAST: sub x[[A:[0-9]+]], fp, #24 +; FAST: 
add x[[A:[0-9]+]], sp, #48 +; Call memcpy with size = 24 (0x18) +; FAST: orr {{x[0-9]+}}, xzr, #0x18 + %tmp = alloca %struct.s42, align 4 + %tmp1 = alloca %struct.s42, align 4 + %0 = bitcast %struct.s42* %tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s42* @g42 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4 + %1 = bitcast %struct.s42* %tmp1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s42* @g42_2 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4 + %call = call i32 @f42(i32 3, %struct.s42* %tmp, %struct.s42* %tmp1) #5 + ret i32 %call +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) #4 + +declare i32 @f42_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, + i32 %i7, i32 %i8, i32 %i9, %struct.s42* nocapture %s1, + %struct.s42* nocapture %s2) #2 + +define i32 @caller42_stack() #3 { +entry: +; CHECK: caller42_stack +; CHECK: mov fp, sp +; CHECK: sub sp, sp, #96 +; CHECK: stur {{x[0-9]+}}, [fp, #-16] +; CHECK: stur {{q[0-9]+}}, [fp, #-32] +; CHECK: str {{x[0-9]+}}, [sp, #48] +; CHECK: str {{q[0-9]+}}, [sp, #32] +; Space for s1 is allocated at fp-32 = sp+64 +; Space for s2 is allocated at sp+32 +; CHECK: add x[[B:[0-9]+]], sp, #32 +; CHECK: str x[[B]], [sp, #16] +; CHECK: sub x[[A:[0-9]+]], fp, #32 +; Address of s1 is passed on stack at sp+8 +; CHECK: str x[[A]], [sp, #8] +; CHECK: movz w[[C:[0-9]+]], #9 +; CHECK: str w[[C]], [sp] + +; FAST: caller42_stack +; Space for s1 is allocated at fp-24 +; Space for s2 is allocated at fp-48 +; FAST: sub x[[A:[0-9]+]], fp, #24 +; FAST: sub x[[B:[0-9]+]], fp, #48 +; Call memcpy with size = 24 (0x18) +; FAST: orr {{x[0-9]+}}, xzr, #0x18 +; FAST: str {{w[0-9]+}}, [sp] +; Address of s1 is passed on stack at sp+8 +; FAST: str {{x[0-9]+}}, [sp, #8] +; FAST: str {{x[0-9]+}}, [sp, #16] + %tmp = alloca %struct.s42, align 4 + %tmp1 = alloca %struct.s42, align 4 + %0 = bitcast %struct.s42* %tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s42* @g42 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4 + %1 = bitcast %struct.s42* %tmp1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s42* @g42_2 to i8*), i64 24, i32 4, i1 false), !tbaa.struct !4 + %call = call i32 @f42_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, + i32 8, i32 9, %struct.s42* %tmp, %struct.s42* %tmp1) #5 + ret i32 %call +} + +; structs with size of 22 bytes, alignment of 16 +; passed indirectly in x1 and x2 +define i32 @f43(i32 %i, %struct.s43* nocapture %s1, %struct.s43* nocapture %s2) #2 { +entry: +; CHECK: f43 +; CHECK: ldr w[[A:[0-9]+]], [x1] +; CHECK: ldr w[[B:[0-9]+]], [x2] +; CHECK: add w[[C:[0-9]+]], w[[A]], w0 +; CHECK: add {{w[0-9]+}}, w[[C]], w[[B]] +; FAST: f43 +; FAST: ldr w[[A:[0-9]+]], [x1] +; FAST: ldr w[[B:[0-9]+]], [x2] +; FAST: add w[[C:[0-9]+]], w[[A]], w0 +; FAST: add {{w[0-9]+}}, w[[C]], w[[B]] + %i1 = getelementptr inbounds %struct.s43* %s1, i64 0, i32 0 + %0 = load i32* %i1, align 4, !tbaa !0 + %i2 = getelementptr inbounds %struct.s43* %s2, i64 0, i32 0 + %1 = load i32* %i2, align 4, !tbaa !0 + %s = getelementptr inbounds %struct.s43* %s1, i64 0, i32 1 + %2 = load i16* %s, align 2, !tbaa !3 + %conv = sext i16 %2 to i32 + %s5 = getelementptr inbounds %struct.s43* %s2, i64 0, i32 1 + %3 = load i16* %s5, align 2, !tbaa !3 + %conv6 = sext i16 %3 to i32 + %add = add i32 %0, %i + %add3 = add i32 %add, %1 + %add4 = add i32 %add3, %conv + %add7 = add i32 %add4, %conv6 + ret i32 %add7 +} + +define i32 @caller43() 
#3 { +entry: +; CHECK: caller43 +; CHECK: str {{q[0-9]+}}, [sp, #48] +; CHECK: str {{q[0-9]+}}, [sp, #32] +; CHECK: str {{q[0-9]+}}, [sp, #16] +; CHECK: str {{q[0-9]+}}, [sp] +; CHECK: add x1, sp, #32 +; CHECK: mov x2, sp +; Space for s1 is allocated at sp+32 +; Space for s2 is allocated at sp + +; FAST: caller43 +; FAST: mov fp, sp +; Space for s1 is allocated at sp+32 +; Space for s2 is allocated at sp +; FAST: add x1, sp, #32 +; FAST: mov x2, sp +; FAST: str {{x[0-9]+}}, [sp, #32] +; FAST: str {{x[0-9]+}}, [sp, #40] +; FAST: str {{x[0-9]+}}, [sp, #48] +; FAST: str {{x[0-9]+}}, [sp, #56] +; FAST: str {{x[0-9]+}}, [sp] +; FAST: str {{x[0-9]+}}, [sp, #8] +; FAST: str {{x[0-9]+}}, [sp, #16] +; FAST: str {{x[0-9]+}}, [sp, #24] + %tmp = alloca %struct.s43, align 16 + %tmp1 = alloca %struct.s43, align 16 + %0 = bitcast %struct.s43* %tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s43* @g43 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4 + %1 = bitcast %struct.s43* %tmp1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s43* @g43_2 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4 + %call = call i32 @f43(i32 3, %struct.s43* %tmp, %struct.s43* %tmp1) #5 + ret i32 %call +} + +declare i32 @f43_stack(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, + i32 %i7, i32 %i8, i32 %i9, %struct.s43* nocapture %s1, + %struct.s43* nocapture %s2) #2 + +define i32 @caller43_stack() #3 { +entry: +; CHECK: caller43_stack +; CHECK: mov fp, sp +; CHECK: sub sp, sp, #96 +; CHECK: stur {{q[0-9]+}}, [fp, #-16] +; CHECK: stur {{q[0-9]+}}, [fp, #-32] +; CHECK: str {{q[0-9]+}}, [sp, #48] +; CHECK: str {{q[0-9]+}}, [sp, #32] +; Space for s1 is allocated at fp-32 = sp+64 +; Space for s2 is allocated at sp+32 +; CHECK: add x[[B:[0-9]+]], sp, #32 +; CHECK: str x[[B]], [sp, #16] +; CHECK: sub x[[A:[0-9]+]], fp, #32 +; Address of s1 is passed on stack at sp+8 +; CHECK: str x[[A]], [sp, #8] +; CHECK: movz w[[C:[0-9]+]], #9 +; CHECK: str w[[C]], [sp] + +; FAST: caller43_stack +; FAST: sub sp, sp, #96 +; Space for s1 is allocated at fp-32 = sp+64 +; Space for s2 is allocated at sp+32 +; FAST: sub x[[A:[0-9]+]], fp, #32 +; FAST: add x[[B:[0-9]+]], sp, #32 +; FAST: stur {{x[0-9]+}}, [fp, #-32] +; FAST: stur {{x[0-9]+}}, [fp, #-24] +; FAST: stur {{x[0-9]+}}, [fp, #-16] +; FAST: stur {{x[0-9]+}}, [fp, #-8] +; FAST: str {{x[0-9]+}}, [sp, #32] +; FAST: str {{x[0-9]+}}, [sp, #40] +; FAST: str {{x[0-9]+}}, [sp, #48] +; FAST: str {{x[0-9]+}}, [sp, #56] +; FAST: str {{w[0-9]+}}, [sp] +; Address of s1 is passed on stack at sp+8 +; FAST: str {{x[0-9]+}}, [sp, #8] +; FAST: str {{x[0-9]+}}, [sp, #16] + %tmp = alloca %struct.s43, align 16 + %tmp1 = alloca %struct.s43, align 16 + %0 = bitcast %struct.s43* %tmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.s43* @g43 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4 + %1 = bitcast %struct.s43* %tmp1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s43* @g43_2 to i8*), i64 32, i32 16, i1 false), !tbaa.struct !4 + %call = call i32 @f43_stack(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, + i32 8, i32 9, %struct.s43* %tmp, %struct.s43* %tmp1) #5 + ret i32 %call +} + +; rdar://13668927 +; Check that we don't split an i128. +declare i32 @callee_i128_split(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, + i32 %i6, i32 %i7, i128 %s1, i32 %i8) + +define i32 @i128_split() { +entry: +; CHECK: i128_split +; "i128 %0" should be on stack at [sp]. +; "i32 8" should be on stack at [sp, #16]. 
+; CHECK: str {{w[0-9]+}}, [sp, #16] +; CHECK: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp] +; FAST: i128_split +; FAST: mov x[[ADDR:[0-9]+]], sp +; FAST: str {{w[0-9]+}}, [x[[ADDR]], #16] +; FAST: stp {{x[0-9]+}}, {{x[0-9]+}}, [x[[ADDR]]] + %0 = load i128* bitcast (%struct.s41* @g41 to i128*), align 16 + %call = tail call i32 @callee_i128_split(i32 1, i32 2, i32 3, i32 4, i32 5, + i32 6, i32 7, i128 %0, i32 8) #5 + ret i32 %call +} + +declare i32 @callee_i64(i32 %i, i32 %i2, i32 %i3, i32 %i4, i32 %i5, + i32 %i6, i32 %i7, i64 %s1, i32 %i8) + +define i32 @i64_split() { +entry: +; CHECK: i64_split +; "i64 %0" should be in register x7. +; "i32 8" should be on stack at [sp]. +; CHECK: ldr x7, [{{x[0-9]+}}] +; CHECK: str {{w[0-9]+}}, [sp] +; FAST: i64_split +; FAST: ldr x7, [{{x[0-9]+}}] +; FAST: str {{w[0-9]+}}, [sp] + %0 = load i64* bitcast (%struct.s41* @g41 to i64*), align 16 + %call = tail call i32 @callee_i64(i32 1, i32 2, i32 3, i32 4, i32 5, + i32 6, i32 7, i64 %0, i32 8) #5 + ret i32 %call +} + +attributes #0 = { noinline nounwind readnone "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" } +attributes #1 = { nounwind readonly "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" } +attributes #2 = { noinline nounwind readonly "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" } +attributes #3 = { nounwind "fp-contract-model"="standard" "relocation-model"="pic" "ssp-buffers-size"="8" } +attributes #4 = { nounwind } +attributes #5 = { nobuiltin } + +!0 = metadata !{metadata !"int", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} +!3 = metadata !{metadata !"short", metadata !1} +!4 = metadata !{i64 0, i64 4, metadata !0, i64 4, i64 2, metadata !3, i64 8, i64 4, metadata !0, i64 12, i64 2, metadata !3, i64 16, i64 4, metadata !0, i64 20, i64 2, metadata !3} diff --git a/test/CodeGen/ARM64/addp.ll b/test/CodeGen/ARM64/addp.ll new file mode 100644 index 0000000000..8283a0005c --- /dev/null +++ b/test/CodeGen/ARM64/addp.ll @@ -0,0 +1,32 @@ +; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s + +define double @foo(<2 x double> %a) nounwind { +; CHECK-LABEL: foo: +; CHECK: faddp.2d d0, v0 +; CHECK-NEXT: ret + %lane0.i = extractelement <2 x double> %a, i32 0 + %lane1.i = extractelement <2 x double> %a, i32 1 + %vpaddd.i = fadd double %lane0.i, %lane1.i + ret double %vpaddd.i +} + +define i64 @foo0(<2 x i64> %a) nounwind { +; CHECK-LABEL: foo0: +; CHECK: addp.2d d0, v0 +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %lane0.i = extractelement <2 x i64> %a, i32 0 + %lane1.i = extractelement <2 x i64> %a, i32 1 + %vpaddd.i = add i64 %lane0.i, %lane1.i + ret i64 %vpaddd.i +} + +define float @foo1(<2 x float> %a) nounwind { +; CHECK-LABEL: foo1: +; CHECK: faddp.2s +; CHECK-NEXT: ret + %lane0.i = extractelement <2 x float> %a, i32 0 + %lane1.i = extractelement <2 x float> %a, i32 1 + %vpaddd.i = fadd float %lane0.i, %lane1.i + ret float %vpaddd.i +} diff --git a/test/CodeGen/ARM64/addr-mode-folding.ll b/test/CodeGen/ARM64/addr-mode-folding.ll new file mode 100644 index 0000000000..dff2331d29 --- /dev/null +++ b/test/CodeGen/ARM64/addr-mode-folding.ll @@ -0,0 +1,171 @@ +; RUN: llc -O3 -mtriple arm64-apple-ios3 %s -o - | FileCheck %s +; + +@block = common global i8* null, align 8 + +define i32 @fct(i32 %i1, i32 %i2) { +; CHECK: @fct +; Sign extension is used more than once, thus it should not be folded. 
+; CodeGenPrepare is not sharing the sext across uses, thus this is folded because
+; of that.
+; _CHECK-NOT_: , sxtw]
+entry:
+  %idxprom = sext i32 %i1 to i64
+  %0 = load i8** @block, align 8
+  %arrayidx = getelementptr inbounds i8* %0, i64 %idxprom
+  %1 = load i8* %arrayidx, align 1
+  %idxprom1 = sext i32 %i2 to i64
+  %arrayidx2 = getelementptr inbounds i8* %0, i64 %idxprom1
+  %2 = load i8* %arrayidx2, align 1
+  %cmp = icmp eq i8 %1, %2
+  br i1 %cmp, label %if.end, label %if.then
+
+if.then:                                          ; preds = %entry
+  %cmp7 = icmp ugt i8 %1, %2
+  %conv8 = zext i1 %cmp7 to i32
+  br label %return
+
+if.end:                                           ; preds = %entry
+  %inc = add nsw i32 %i1, 1
+  %inc9 = add nsw i32 %i2, 1
+  %idxprom10 = sext i32 %inc to i64
+  %arrayidx11 = getelementptr inbounds i8* %0, i64 %idxprom10
+  %3 = load i8* %arrayidx11, align 1
+  %idxprom12 = sext i32 %inc9 to i64
+  %arrayidx13 = getelementptr inbounds i8* %0, i64 %idxprom12
+  %4 = load i8* %arrayidx13, align 1
+  %cmp16 = icmp eq i8 %3, %4
+  br i1 %cmp16, label %if.end23, label %if.then18
+
+if.then18:                                        ; preds = %if.end
+  %cmp21 = icmp ugt i8 %3, %4
+  %conv22 = zext i1 %cmp21 to i32
+  br label %return
+
+if.end23:                                         ; preds = %if.end
+  %inc24 = add nsw i32 %i1, 2
+  %inc25 = add nsw i32 %i2, 2
+  %idxprom26 = sext i32 %inc24 to i64
+  %arrayidx27 = getelementptr inbounds i8* %0, i64 %idxprom26
+  %5 = load i8* %arrayidx27, align 1
+  %idxprom28 = sext i32 %inc25 to i64
+  %arrayidx29 = getelementptr inbounds i8* %0, i64 %idxprom28
+  %6 = load i8* %arrayidx29, align 1
+  %cmp32 = icmp eq i8 %5, %6
+  br i1 %cmp32, label %return, label %if.then34
+
+if.then34:                                        ; preds = %if.end23
+  %cmp37 = icmp ugt i8 %5, %6
+  %conv38 = zext i1 %cmp37 to i32
+  br label %return
+
+return:                                           ; preds = %if.end23, %if.then34, %if.then18, %if.then
+  %retval.0 = phi i32 [ %conv8, %if.then ], [ %conv22, %if.then18 ], [ %conv38, %if.then34 ], [ 1, %if.end23 ]
+  ret i32 %retval.0
+}
+
+define i32 @fct1(i32 %i1, i32 %i2) optsize {
+; CHECK: @fct1
+; Addressing modes are folded when optimizing for code size.
+; CHECK: , sxtw] +; CHECK: , sxtw] +entry: + %idxprom = sext i32 %i1 to i64 + %0 = load i8** @block, align 8 + %arrayidx = getelementptr inbounds i8* %0, i64 %idxprom + %1 = load i8* %arrayidx, align 1 + %idxprom1 = sext i32 %i2 to i64 + %arrayidx2 = getelementptr inbounds i8* %0, i64 %idxprom1 + %2 = load i8* %arrayidx2, align 1 + %cmp = icmp eq i8 %1, %2 + br i1 %cmp, label %if.end, label %if.then + +if.then: ; preds = %entry + %cmp7 = icmp ugt i8 %1, %2 + %conv8 = zext i1 %cmp7 to i32 + br label %return + +if.end: ; preds = %entry + %inc = add nsw i32 %i1, 1 + %inc9 = add nsw i32 %i2, 1 + %idxprom10 = sext i32 %inc to i64 + %arrayidx11 = getelementptr inbounds i8* %0, i64 %idxprom10 + %3 = load i8* %arrayidx11, align 1 + %idxprom12 = sext i32 %inc9 to i64 + %arrayidx13 = getelementptr inbounds i8* %0, i64 %idxprom12 + %4 = load i8* %arrayidx13, align 1 + %cmp16 = icmp eq i8 %3, %4 + br i1 %cmp16, label %if.end23, label %if.then18 + +if.then18: ; preds = %if.end + %cmp21 = icmp ugt i8 %3, %4 + %conv22 = zext i1 %cmp21 to i32 + br label %return + +if.end23: ; preds = %if.end + %inc24 = add nsw i32 %i1, 2 + %inc25 = add nsw i32 %i2, 2 + %idxprom26 = sext i32 %inc24 to i64 + %arrayidx27 = getelementptr inbounds i8* %0, i64 %idxprom26 + %5 = load i8* %arrayidx27, align 1 + %idxprom28 = sext i32 %inc25 to i64 + %arrayidx29 = getelementptr inbounds i8* %0, i64 %idxprom28 + %6 = load i8* %arrayidx29, align 1 + %cmp32 = icmp eq i8 %5, %6 + br i1 %cmp32, label %return, label %if.then34 + +if.then34: ; preds = %if.end23 + %cmp37 = icmp ugt i8 %5, %6 + %conv38 = zext i1 %cmp37 to i32 + br label %return + +return: ; preds = %if.end23, %if.then34, %if.then18, %if.then + %retval.0 = phi i32 [ %conv8, %if.then ], [ %conv22, %if.then18 ], [ %conv38, %if.then34 ], [ 1, %if.end23 ] + ret i32 %retval.0 +} + +; CHECK: @test +; CHECK-NOT: , uxtw #2] +define i32 @test(i32* %array, i8 zeroext %c, i32 %arg) { +entry: + %conv = zext i8 %c to i32 + %add = sub i32 0, %arg + %tobool = icmp eq i32 %conv, %add + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %idxprom = zext i8 %c to i64 + %arrayidx = getelementptr inbounds i32* %array, i64 %idxprom + %0 = load volatile i32* %arrayidx, align 4 + %1 = load volatile i32* %arrayidx, align 4 + %add3 = add nsw i32 %1, %0 + br label %if.end + +if.end: ; preds = %entry, %if.then + %res.0 = phi i32 [ %add3, %if.then ], [ 0, %entry ] + ret i32 %res.0 +} + + +; CHECK: @test2 +; CHECK: , uxtw #2] +; CHECK: , uxtw #2] +define i32 @test2(i32* %array, i8 zeroext %c, i32 %arg) optsize { +entry: + %conv = zext i8 %c to i32 + %add = sub i32 0, %arg + %tobool = icmp eq i32 %conv, %add + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %idxprom = zext i8 %c to i64 + %arrayidx = getelementptr inbounds i32* %array, i64 %idxprom + %0 = load volatile i32* %arrayidx, align 4 + %1 = load volatile i32* %arrayidx, align 4 + %add3 = add nsw i32 %1, %0 + br label %if.end + +if.end: ; preds = %entry, %if.then + %res.0 = phi i32 [ %add3, %if.then ], [ 0, %entry ] + ret i32 %res.0 +} diff --git a/test/CodeGen/ARM64/addr-type-promotion.ll b/test/CodeGen/ARM64/addr-type-promotion.ll new file mode 100644 index 0000000000..0677603473 --- /dev/null +++ b/test/CodeGen/ARM64/addr-type-promotion.ll @@ -0,0 +1,82 @@ +; RUN: llc -march arm64 < %s | FileCheck %s +; rdar://13452552 +; ModuleID = 'reduced_test.ll' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128" 
+target triple = "arm64-apple-ios3.0.0" + +@block = common global i8* null, align 8 + +define zeroext i8 @fullGtU(i32 %i1, i32 %i2) { +; CHECK: fullGtU +; CHECK: adrp [[PAGE:x[0-9]+]], _block@GOTPAGE +; CHECK: ldr [[ADDR:x[0-9]+]], {{\[}}[[PAGE]], _block@GOTPAGEOFF] +; CHECK-NEXT: ldr [[BLOCKBASE:x[0-9]+]], {{\[}}[[ADDR]]] +; CHECK-NEXT: ldrb [[BLOCKVAL1:w[0-9]+]], {{\[}}[[BLOCKBASE]], x0, sxtw] +; CHECK-NEXT: ldrb [[BLOCKVAL2:w[0-9]+]], {{\[}}[[BLOCKBASE]], x1, sxtw] +; CHECK-NEXT cmp [[BLOCKVAL1]], [[BLOCKVAL2]] +; CHECK-NEXT b.ne +; Next BB +; CHECK: add [[BLOCKBASE2:x[0-9]+]], [[BLOCKBASE]], w1, sxtw +; CHECK-NEXT: add [[BLOCKBASE1:x[0-9]+]], [[BLOCKBASE]], w0, sxtw +; CHECK-NEXT: ldrb [[LOADEDVAL1:w[0-9]+]], {{\[}}[[BLOCKBASE1]], #1] +; CHECK-NEXT: ldrb [[LOADEDVAL2:w[0-9]+]], {{\[}}[[BLOCKBASE2]], #1] +; CHECK-NEXT: cmp [[LOADEDVAL1]], [[LOADEDVAL2]] +; CHECK-NEXT: b.ne +; Next BB +; CHECK: ldrb [[LOADEDVAL3:w[0-9]+]], {{\[}}[[BLOCKBASE1]], #2] +; CHECK-NEXT: ldrb [[LOADEDVAL4:w[0-9]+]], {{\[}}[[BLOCKBASE2]], #2] +; CHECK-NEXT: cmp [[LOADEDVAL3]], [[LOADEDVAL4]] +entry: + %idxprom = sext i32 %i1 to i64 + %tmp = load i8** @block, align 8 + %arrayidx = getelementptr inbounds i8* %tmp, i64 %idxprom + %tmp1 = load i8* %arrayidx, align 1 + %idxprom1 = sext i32 %i2 to i64 + %arrayidx2 = getelementptr inbounds i8* %tmp, i64 %idxprom1 + %tmp2 = load i8* %arrayidx2, align 1 + %cmp = icmp eq i8 %tmp1, %tmp2 + br i1 %cmp, label %if.end, label %if.then + +if.then: ; preds = %entry + %cmp7 = icmp ugt i8 %tmp1, %tmp2 + %conv9 = zext i1 %cmp7 to i8 + br label %return + +if.end: ; preds = %entry + %inc = add nsw i32 %i1, 1 + %inc10 = add nsw i32 %i2, 1 + %idxprom11 = sext i32 %inc to i64 + %arrayidx12 = getelementptr inbounds i8* %tmp, i64 %idxprom11 + %tmp3 = load i8* %arrayidx12, align 1 + %idxprom13 = sext i32 %inc10 to i64 + %arrayidx14 = getelementptr inbounds i8* %tmp, i64 %idxprom13 + %tmp4 = load i8* %arrayidx14, align 1 + %cmp17 = icmp eq i8 %tmp3, %tmp4 + br i1 %cmp17, label %if.end25, label %if.then19 + +if.then19: ; preds = %if.end + %cmp22 = icmp ugt i8 %tmp3, %tmp4 + %conv24 = zext i1 %cmp22 to i8 + br label %return + +if.end25: ; preds = %if.end + %inc26 = add nsw i32 %i1, 2 + %inc27 = add nsw i32 %i2, 2 + %idxprom28 = sext i32 %inc26 to i64 + %arrayidx29 = getelementptr inbounds i8* %tmp, i64 %idxprom28 + %tmp5 = load i8* %arrayidx29, align 1 + %idxprom30 = sext i32 %inc27 to i64 + %arrayidx31 = getelementptr inbounds i8* %tmp, i64 %idxprom30 + %tmp6 = load i8* %arrayidx31, align 1 + %cmp34 = icmp eq i8 %tmp5, %tmp6 + br i1 %cmp34, label %return, label %if.then36 + +if.then36: ; preds = %if.end25 + %cmp39 = icmp ugt i8 %tmp5, %tmp6 + %conv41 = zext i1 %cmp39 to i8 + br label %return + +return: ; preds = %if.then36, %if.end25, %if.then19, %if.then + %retval.0 = phi i8 [ %conv9, %if.then ], [ %conv24, %if.then19 ], [ %conv41, %if.then36 ], [ 0, %if.end25 ] + ret i8 %retval.0 +} diff --git a/test/CodeGen/ARM64/addrmode.ll b/test/CodeGen/ARM64/addrmode.ll new file mode 100644 index 0000000000..e1312376e2 --- /dev/null +++ b/test/CodeGen/ARM64/addrmode.ll @@ -0,0 +1,72 @@ +; RUN: llc -march=arm64 < %s | FileCheck %s +; rdar://10232252 + +@object = external hidden global i64, section "__DATA, __objc_ivar", align 8 + +; base + offset (imm9) +; CHECK: @t1 +; CHECK: ldr xzr, [x{{[0-9]+}}, #8] +; CHECK: ret +define void @t1() { + %incdec.ptr = getelementptr inbounds i64* @object, i64 1 + %tmp = load volatile i64* %incdec.ptr, align 8 + ret void +} + +; base + offset (> imm9) +; CHECK: @t2 +; 
CHECK: sub [[ADDREG:x[0-9]+]], x{{[0-9]+}}, #264 +; CHECK: ldr xzr, [ +; CHECK: [[ADDREG]]] +; CHECK: ret +define void @t2() { + %incdec.ptr = getelementptr inbounds i64* @object, i64 -33 + %tmp = load volatile i64* %incdec.ptr, align 8 + ret void +} + +; base + unsigned offset (> imm9 and <= imm12 * size of type in bytes) +; CHECK: @t3 +; CHECK: ldr xzr, [x{{[0-9]+}}, #32760] +; CHECK: ret +define void @t3() { + %incdec.ptr = getelementptr inbounds i64* @object, i64 4095 + %tmp = load volatile i64* %incdec.ptr, align 8 + ret void +} + +; base + unsigned offset (> imm12 * size of type in bytes) +; CHECK: @t4 +; CHECK: add [[ADDREG:x[0-9]+]], x{{[0-9]+}}, #32768 +; CHECK: ldr xzr, [ +; CHECK: [[ADDREG]]] +; CHECK: ret +define void @t4() { + %incdec.ptr = getelementptr inbounds i64* @object, i64 4096 + %tmp = load volatile i64* %incdec.ptr, align 8 + ret void +} + +; base + reg +; CHECK: @t5 +; CHECK: ldr xzr, [x{{[0-9]+}}, x{{[0-9]+}}, lsl #3] +; CHECK: ret +define void @t5(i64 %a) { + %incdec.ptr = getelementptr inbounds i64* @object, i64 %a + %tmp = load volatile i64* %incdec.ptr, align 8 + ret void +} + +; base + reg + imm +; CHECK: @t6 +; CHECK: add [[ADDREG:x[0-9]+]], x{{[0-9]+}}, x{{[0-9]+}}, lsl #3 +; CHECK-NEXT: add [[ADDREG]], [[ADDREG]], #32768 +; CHECK: ldr xzr, [ +; CHECK: [[ADDREG]]] +; CHECK: ret +define void @t6(i64 %a) { + %tmp1 = getelementptr inbounds i64* @object, i64 %a + %incdec.ptr = getelementptr inbounds i64* %tmp1, i64 4096 + %tmp = load volatile i64* %incdec.ptr, align 8 + ret void +} diff --git a/test/CodeGen/ARM64/alloc-no-stack-realign.ll b/test/CodeGen/ARM64/alloc-no-stack-realign.ll new file mode 100644 index 0000000000..f396bc9917 --- /dev/null +++ b/test/CodeGen/ARM64/alloc-no-stack-realign.ll @@ -0,0 +1,21 @@ +; RUN: llc < %s -mtriple=arm64-apple-darwin -enable-misched=false | FileCheck %s + +; rdar://12713765 +; Make sure we are not creating stack objects that are assumed to be 64-byte +; aligned. 
+@T3_retval = common global <16 x float> zeroinitializer, align 16 + +define void @test(<16 x float>* noalias sret %agg.result) nounwind ssp { +entry: +; CHECK: test +; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], [sp, #32] +; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], [sp] +; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], {{\[}}[[BASE:x[0-9]+]], #32] +; CHECK: stp [[Q1:q[0-9]+]], [[Q2:q[0-9]+]], {{\[}}[[BASE]]] + %retval = alloca <16 x float>, align 16 + %0 = load <16 x float>* @T3_retval, align 16 + store <16 x float> %0, <16 x float>* %retval + %1 = load <16 x float>* %retval + store <16 x float> %1, <16 x float>* %agg.result, align 16 + ret void +} diff --git a/test/CodeGen/ARM64/alloca-frame-pointer-offset.ll b/test/CodeGen/ARM64/alloca-frame-pointer-offset.ll new file mode 100644 index 0000000000..3750f31b37 --- /dev/null +++ b/test/CodeGen/ARM64/alloca-frame-pointer-offset.ll @@ -0,0 +1,29 @@ +; RUN: llc -march=arm64 -mcpu=cyclone < %s | FileCheck %s + +; CHECK: foo +; CHECK: ldr w[[REG:[0-9]+]], [x19, #264] +; CHECK: str w[[REG]], [x19, #132] +; CHECK: ldr w{{[0-9]+}}, [x19, #264] + +define i32 @foo(i32 %a) nounwind { + %retval = alloca i32, align 4 + %a.addr = alloca i32, align 4 + %arr = alloca [32 x i32], align 4 + %i = alloca i32, align 4 + %arr2 = alloca [32 x i32], align 4 + %j = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + %tmp = load i32* %a.addr, align 4 + %tmp1 = zext i32 %tmp to i64 + %v = mul i64 4, %tmp1 + %vla = alloca i8, i64 %v, align 4 + %tmp2 = bitcast i8* %vla to i32* + %tmp3 = load i32* %a.addr, align 4 + store i32 %tmp3, i32* %i, align 4 + %tmp4 = load i32* %a.addr, align 4 + store i32 %tmp4, i32* %j, align 4 + %tmp5 = load i32* %j, align 4 + store i32 %tmp5, i32* %retval + %x = load i32* %retval + ret i32 %x +} diff --git a/test/CodeGen/ARM64/andCmpBrToTBZ.ll b/test/CodeGen/ARM64/andCmpBrToTBZ.ll new file mode 100644 index 0000000000..419497722f --- /dev/null +++ b/test/CodeGen/ARM64/andCmpBrToTBZ.ll @@ -0,0 +1,72 @@ +; RUN: llc -O1 -march=arm64 -enable-andcmp-sinking=true < %s | FileCheck %s +; ModuleID = 'and-cbz-extr-mr.bc' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64-S128" +target triple = "arm64-apple-ios7.0.0" + +define zeroext i1 @foo(i1 %IsEditable, i1 %isTextField, i8* %str1, i8* %str2, i8* %str3, i8* %str4, i8* %str5, i8* %str6, i8* %str7, i8* %str8, i8* %str9, i8* %str10, i8* %str11, i8* %str12, i8* %str13, i32 %int1, i8* %str14) unnamed_addr #0 align 2 { +; CHECK: _foo: +entry: + %tobool = icmp eq i8* %str14, null + br i1 %tobool, label %return, label %if.end + +; CHECK: %if.end +; CHECK: tbz +if.end: ; preds = %entry + %and.i.i.i = and i32 %int1, 4 + %tobool.i.i.i = icmp eq i32 %and.i.i.i, 0 + br i1 %tobool.i.i.i, label %if.end12, label %land.rhs.i + +land.rhs.i: ; preds = %if.end + %cmp.i.i.i = icmp eq i8* %str12, %str13 + br i1 %cmp.i.i.i, label %if.then3, label %lor.rhs.i.i.i + +lor.rhs.i.i.i: ; preds = %land.rhs.i + %cmp.i13.i.i.i = icmp eq i8* %str10, %str11 + br i1 %cmp.i13.i.i.i, label %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit, label %if.end5 + +_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit: ; preds = %lor.rhs.i.i.i + %cmp.i.i.i.i = icmp eq i8* %str8, %str9 + br i1 %cmp.i.i.i.i, label %if.then3, label %if.end5 + +if.then3: ; preds = %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit, %land.rhs.i + %tmp11 = load i8* %str14, align 8 + %tmp12 = and i8 %tmp11, 2 + %tmp13 = icmp ne i8 %tmp12, 0 + br label %return 
+ +if.end5: ; preds = %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit, %lor.rhs.i.i.i +; CHECK: %if.end5 +; CHECK: tbz + br i1 %tobool.i.i.i, label %if.end12, label %land.rhs.i19 + +land.rhs.i19: ; preds = %if.end5 + %cmp.i.i.i18 = icmp eq i8* %str6, %str7 + br i1 %cmp.i.i.i18, label %if.then7, label %lor.rhs.i.i.i23 + +lor.rhs.i.i.i23: ; preds = %land.rhs.i19 + %cmp.i13.i.i.i22 = icmp eq i8* %str3, %str4 + br i1 %cmp.i13.i.i.i22, label %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28, label %if.end12 + +_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28: ; preds = %lor.rhs.i.i.i23 + %cmp.i.i.i.i26 = icmp eq i8* %str1, %str2 + br i1 %cmp.i.i.i.i26, label %if.then7, label %if.end12 + +if.then7: ; preds = %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28, %land.rhs.i19 + br i1 %isTextField, label %if.then9, label %if.end12 + +if.then9: ; preds = %if.then7 + %tmp23 = load i8* %str5, align 8 + %tmp24 = and i8 %tmp23, 2 + %tmp25 = icmp ne i8 %tmp24, 0 + br label %return + +if.end12: ; preds = %if.then7, %_ZNK7WebCore4Node10hasTagNameERKNS_13QualifiedNameE.exit28, %lor.rhs.i.i.i23, %if.end5, %if.end + %lnot = xor i1 %IsEditable, true + br label %return + +return: ; preds = %if.end12, %if.then9, %if.then3, %entry + %retval.0 = phi i1 [ %tmp13, %if.then3 ], [ %tmp25, %if.then9 ], [ %lnot, %if.end12 ], [ true, %entry ] + ret i1 %retval.0 +} + +attributes #0 = { nounwind ssp } diff --git a/test/CodeGen/ARM64/anyregcc-crash.ll b/test/CodeGen/ARM64/anyregcc-crash.ll new file mode 100644 index 0000000000..241cf974c0 --- /dev/null +++ b/test/CodeGen/ARM64/anyregcc-crash.ll @@ -0,0 +1,19 @@ +; RUN: not llc < %s -mtriple=arm64-apple-darwin 2>&1 | FileCheck %s +; +; Check that misuse of anyregcc results in a compile time error. + +; CHECK: LLVM ERROR: ran out of registers during register allocation +define i64 @anyreglimit(i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i64 %v7, i64 %v8, + i64 %v9, i64 %v10, i64 %v11, i64 %v12, i64 %v13, i64 %v14, i64 %v15, i64 %v16, + i64 %v17, i64 %v18, i64 %v19, i64 %v20, i64 %v21, i64 %v22, i64 %v23, i64 %v24, + i64 %v25, i64 %v26, i64 %v27, i64 %v28, i64 %v29, i64 %v30, i64 %v31, i64 %v32) { +entry: + %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 32, + i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i64 %v7, i64 %v8, + i64 %v9, i64 %v10, i64 %v11, i64 %v12, i64 %v13, i64 %v14, i64 %v15, i64 %v16, + i64 %v17, i64 %v18, i64 %v19, i64 %v20, i64 %v21, i64 %v22, i64 %v23, i64 %v24, + i64 %v25, i64 %v26, i64 %v27, i64 %v28, i64 %v29, i64 %v30, i64 %v31, i64 %v32) + ret i64 %result +} + +declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...) 
diff --git a/test/CodeGen/ARM64/anyregcc.ll b/test/CodeGen/ARM64/anyregcc.ll new file mode 100644 index 0000000000..9e22c5ae18 --- /dev/null +++ b/test/CodeGen/ARM64/anyregcc.ll @@ -0,0 +1,358 @@ +; RUN: llc < %s -mtriple=arm64-apple-darwin | FileCheck %s + +; Stackmap Header: no constants - 6 callsites +; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps +; CHECK-NEXT: __LLVM_StackMaps: +; Header +; CHECK-NEXT: .long 0 +; Num Functions +; CHECK-NEXT: .long 8 +; CHECK-NEXT: .long _test +; CHECK-NEXT: .long 16 +; CHECK-NEXT: .long _property_access1 +; CHECK-NEXT: .long 16 +; CHECK-NEXT: .long _property_access2 +; CHECK-NEXT: .long 32 +; CHECK-NEXT: .long _property_access3 +; CHECK-NEXT: .long 32 +; CHECK-NEXT: .long _anyreg_test1 +; CHECK-NEXT: .long 16 +; CHECK-NEXT: .long _anyreg_test2 +; CHECK-NEXT: .long 16 +; CHECK-NEXT: .long _patchpoint_spilldef +; CHECK-NEXT: .long 112 +; CHECK-NEXT: .long _patchpoint_spillargs +; CHECK-NEXT: .long 128 +; Num Constants +; CHECK-NEXT: .long 0 +; Num Callsites +; CHECK-NEXT: .long 8 + +; test +; CHECK-LABEL: .long L{{.*}}-_test +; CHECK-NEXT: .short 0 +; 3 locations +; CHECK-NEXT: .short 3 +; Loc 0: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 2: Constant 3 +; CHECK-NEXT: .byte 4 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .long 3 +define i64 @test() nounwind ssp uwtable { +entry: + call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 0, i32 16, i8* null, i32 2, i32 1, i32 2, i64 3) + ret i64 0 +} + +; property access 1 - %obj is an anyreg call argument and should therefore be in a register +; CHECK-LABEL: .long L{{.*}}-_property_access1 +; CHECK-NEXT: .short 0 +; 2 locations +; CHECK-NEXT: .short 2 +; Loc 0: Register <-- this is the return register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +define i64 @property_access1(i8* %obj) nounwind ssp uwtable { +entry: + %f = inttoptr i64 281474417671919 to i8* + %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 1, i32 20, i8* %f, i32 1, i8* %obj) + ret i64 %ret +} + +; property access 2 - %obj is an anyreg call argument and should therefore be in a register +; CHECK-LABEL: .long L{{.*}}-_property_access2 +; CHECK-NEXT: .short 0 +; 2 locations +; CHECK-NEXT: .short 2 +; Loc 0: Register <-- this is the return register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +define i64 @property_access2() nounwind ssp uwtable { +entry: + %obj = alloca i64, align 8 + %f = inttoptr i64 281474417671919 to i8* + %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 20, i8* %f, i32 1, i64* %obj) + ret i64 %ret +} + +; property access 3 - %obj is a frame index +; CHECK-LABEL: .long L{{.*}}-_property_access3 +; CHECK-NEXT: .short 0 +; 2 locations +; CHECK-NEXT: .short 2 +; Loc 0: Register <-- this is the return register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Direct FP - 8 +; 
CHECK-NEXT: .byte 2 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short 29 +; CHECK-NEXT: .long -8 +define i64 @property_access3() nounwind ssp uwtable { +entry: + %obj = alloca i64, align 8 + %f = inttoptr i64 281474417671919 to i8* + %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 3, i32 20, i8* %f, i32 0, i64* %obj) + ret i64 %ret +} + +; anyreg_test1 +; CHECK-LABEL: .long L{{.*}}-_anyreg_test1 +; CHECK-NEXT: .short 0 +; 14 locations +; CHECK-NEXT: .short 14 +; Loc 0: Register <-- this is the return register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 2: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 3: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 4: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 5: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 6: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 7: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 8: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 9: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 10: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 11: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 12: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 13: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +define i64 @anyreg_test1(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable { +entry: + %f = inttoptr i64 281474417671919 to i8* + %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 4, i32 20, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) + ret i64 %ret +} + +; anyreg_test2 +; CHECK-LABEL: .long L{{.*}}-_anyreg_test2 +; CHECK-NEXT: .short 0 +; 14 locations +; CHECK-NEXT: .short 14 +; Loc 0: Register <-- this is the return register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 2: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 3: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 4: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 5: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: 
.short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 6: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 7: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 8: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 9: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 10: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 11: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 12: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 13: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +define i64 @anyreg_test2(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable { +entry: + %f = inttoptr i64 281474417671919 to i8* + %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 20, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) + ret i64 %ret +} + +; Test spilling the return value of an anyregcc call. +; +; [JS] Assertion: "Folded a def to a non-store!" +; +; CHECK-LABEL: .long L{{.*}}-_patchpoint_spilldef +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .short 3 +; Loc 0: Register (some register that will be spilled to the stack) +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +define i64 @patchpoint_spilldef(i64 %p1, i64 %p2, i64 %p3, i64 %p4) { +entry: + %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2) + tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind + ret i64 %result +} + +; Test spilling the arguments of an anyregcc call. 
+; +; [JS] AnyRegCC argument ends up being spilled +; +; CHECK-LABEL: .long L{{.*}}-_patchpoint_spillargs +; CHECK-NEXT: .short 0 +; CHECK-NEXT: .short 5 +; Loc 0: Return a register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 1: Arg0 in a Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 2: Arg1 in a Register +; CHECK-NEXT: .byte 1 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short {{[0-9]+}} +; CHECK-NEXT: .long 0 +; Loc 3: Arg2 spilled to FP -96 +; CHECK-NEXT: .byte 3 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short 29 +; CHECK-NEXT: .long -96 +; Loc 4: Arg3 spilled to FP - 88 +; CHECK-NEXT: .byte 3 +; CHECK-NEXT: .byte 8 +; CHECK-NEXT: .short 29 +; CHECK-NEXT: .long -88 +define i64 @patchpoint_spillargs(i64 %p1, i64 %p2, i64 %p3, i64 %p4) { +entry: + tail call void asm sideeffect "nop", "~{x0},~{x1},~{x2},~{x3},~{x4},~{x5},~{x6},~{x7},~{x8},~{x9},~{x10},~{x11},~{x12},~{x13},~{x14},~{x15},~{x16},~{x17},~{x18},~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27},~{x28},~{x29},~{x30},~{x31}"() nounwind + %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 13, i32 16, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4) + ret i64 %result +} + +declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...) +declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...) diff --git a/test/CodeGen/ARM64/arith-saturating.ll b/test/CodeGen/ARM64/arith-saturating.ll new file mode 100644 index 0000000000..437ebb8fe6 --- /dev/null +++ b/test/CodeGen/ARM64/arith-saturating.ll @@ -0,0 +1,153 @@ +; RUN: llc < %s -march=arm64 | FileCheck %s + +define i32 @qadds(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: qadds: +; CHECK: sqadd s0, s0, s1 + %vecext = extractelement <4 x i32> %b, i32 0 + %vecext1 = extractelement <4 x i32> %c, i32 0 + %vqadd.i = tail call i32 @llvm.arm64.neon.sqadd.i32(i32 %vecext, i32 %vecext1) nounwind + ret i32 %vqadd.i +} + +define i64 @qaddd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: qaddd: +; CHECK: sqadd d0, d0, d1 + %vecext = extractelement <2 x i64> %b, i32 0 + %vecext1 = extractelement <2 x i64> %c, i32 0 + %vqadd.i = tail call i64 @llvm.arm64.neon.sqadd.i64(i64 %vecext, i64 %vecext1) nounwind + ret i64 %vqadd.i +} + +define i32 @uqadds(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: uqadds: +; CHECK: uqadd s0, s0, s1 + %vecext = extractelement <4 x i32> %b, i32 0 + %vecext1 = extractelement <4 x i32> %c, i32 0 + %vqadd.i = tail call i32 @llvm.arm64.neon.uqadd.i32(i32 %vecext, i32 %vecext1) nounwind + ret i32 %vqadd.i +} + +define i64 @uqaddd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: uqaddd: +; CHECK: uqadd d0, d0, d1 + %vecext = extractelement <2 x i64> %b, i32 0 + %vecext1 = extractelement <2 x i64> %c, i32 0 + %vqadd.i = tail call i64 @llvm.arm64.neon.uqadd.i64(i64 %vecext, i64 %vecext1) nounwind + ret i64 %vqadd.i +} + +declare i64 @llvm.arm64.neon.uqadd.i64(i64, i64) nounwind readnone +declare i32 @llvm.arm64.neon.uqadd.i32(i32, i32) nounwind readnone +declare i64 @llvm.arm64.neon.sqadd.i64(i64, i64) nounwind readnone +declare i32 @llvm.arm64.neon.sqadd.i32(i32, i32) nounwind readnone + +define i32 @qsubs(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: qsubs: +; CHECK: sqsub s0, s0, s1 + %vecext = 
extractelement <4 x i32> %b, i32 0 + %vecext1 = extractelement <4 x i32> %c, i32 0 + %vqsub.i = tail call i32 @llvm.arm64.neon.sqsub.i32(i32 %vecext, i32 %vecext1) nounwind + ret i32 %vqsub.i +} + +define i64 @qsubd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: qsubd: +; CHECK: sqsub d0, d0, d1 + %vecext = extractelement <2 x i64> %b, i32 0 + %vecext1 = extractelement <2 x i64> %c, i32 0 + %vqsub.i = tail call i64 @llvm.arm64.neon.sqsub.i64(i64 %vecext, i64 %vecext1) nounwind + ret i64 %vqsub.i +} + +define i32 @uqsubs(<4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: uqsubs: +; CHECK: uqsub s0, s0, s1 + %vecext = extractelement <4 x i32> %b, i32 0 + %vecext1 = extractelement <4 x i32> %c, i32 0 + %vqsub.i = tail call i32 @llvm.arm64.neon.uqsub.i32(i32 %vecext, i32 %vecext1) nounwind + ret i32 %vqsub.i +} + +define i64 @uqsubd(<2 x i64> %b, <2 x i64> %c) nounwind readnone optsize ssp { +; CHECK-LABEL: uqsubd: +; CHECK: uqsub d0, d0, d1 + %vecext = extractelement <2 x i64> %b, i32 0 + %vecext1 = extractelement <2 x i64> %c, i32 0 + %vqsub.i = tail call i64 @llvm.arm64.neon.uqsub.i64(i64 %vecext, i64 %vecext1) nounwind + ret i64 %vqsub.i +} + +declare i64 @llvm.arm64.neon.uqsub.i64(i64, i64) nounwind readnone +declare i32 @llvm.arm64.neon.uqsub.i32(i32, i32) nounwind readnone +declare i64 @llvm.arm64.neon.sqsub.i64(i64, i64) nounwind readnone +declare i32 @llvm.arm64.neon.sqsub.i32(i32, i32) nounwind readnone + +define i32 @qabss(<4 x i32> %b, <4 x i32> %c) nounwind readnone { +; CHECK-LABEL: qabss: +; CHECK: sqabs s0, s0 +; CHECK: ret + %vecext = extractelement <4 x i32> %b, i32 0 + %vqabs.i = tail call i32 @llvm.arm64.neon.sqabs.i32(i32 %vecext) nounwind + ret i32 %vqabs.i +} + +define i64 @qabsd(<2 x i64> %b, <2 x i64> %c) nounwind readnone { +; CHECK-LABEL: qabsd: +; CHECK: sqabs d0, d0 +; CHECK: ret + %vecext = extractelement <2 x i64> %b, i32 0 + %vqabs.i = tail call i64 @llvm.arm64.neon.sqabs.i64(i64 %vecext) nounwind + ret i64 %vqabs.i +} + +define i32 @qnegs(<4 x i32> %b, <4 x i32> %c) nounwind readnone { +; CHECK-LABEL: qnegs: +; CHECK: sqneg s0, s0 +; CHECK: ret + %vecext = extractelement <4 x i32> %b, i32 0 + %vqneg.i = tail call i32 @llvm.arm64.neon.sqneg.i32(i32 %vecext) nounwind + ret i32 %vqneg.i +} + +define i64 @qnegd(<2 x i64> %b, <2 x i64> %c) nounwind readnone { +; CHECK-LABEL: qnegd: +; CHECK: sqneg d0, d0 +; CHECK: ret + %vecext = extractelement <2 x i64> %b, i32 0 + %vqneg.i = tail call i64 @llvm.arm64.neon.sqneg.i64(i64 %vecext) nounwind + ret i64 %vqneg.i +} + +declare i64 @llvm.arm64.neon.sqneg.i64(i64) nounwind readnone +declare i32 @llvm.arm64.neon.sqneg.i32(i32) nounwind readnone +declare i64 @llvm.arm64.neon.sqabs.i64(i64) nounwind readnone +declare i32 @llvm.arm64.neon.sqabs.i32(i32) nounwind readnone + + +define i32 @vqmovund(<2 x i64> %b) nounwind readnone { +; CHECK-LABEL: vqmovund: +; CHECK: sqxtun s0, d0 + %vecext = extractelement <2 x i64> %b, i32 0 + %vqmovun.i = tail call i32 @llvm.arm64.neon.scalar.sqxtun.i32.i64(i64 %vecext) nounwind + ret i32 %vqmovun.i +} + +define i32 @vqmovnd_s(<2 x i64> %b) nounwind readnone { +; CHECK-LABEL: vqmovnd_s: +; CHECK: sqxtn s0, d0 + %vecext = extractelement <2 x i64> %b, i32 0 + %vqmovn.i = tail call i32 @llvm.arm64.neon.scalar.sqxtn.i32.i64(i64 %vecext) nounwind + ret i32 %vqmovn.i +} + +define i32 @vqmovnd_u(<2 x i64> %b) nounwind readnone { +; CHECK-LABEL: vqmovnd_u: +; CHECK: uqxtn s0, d0 + %vecext = extractelement <2 x i64> %b, i32 0 + %vqmovn.i = tail call 
i32 @llvm.arm64.neon.scalar.uqxtn.i32.i64(i64 %vecext) nounwind + ret i32 %vqmovn.i +} + +declare i32 @llvm.arm64.neon.scalar.uqxtn.i32.i64(i64) nounwind readnone +declare i32 @llvm.arm64.neon.scalar.sqxtn.i32.i64(i64) nounwind readnone +declare i32 @llvm.arm64.neon.scalar.sqxtun.i32.i64(i64) nounwind readnone diff --git a/test/CodeGen/ARM64/arith.ll b/test/CodeGen/ARM64/arith.ll new file mode 100644 index 0000000000..b6ff0da3b2 --- /dev/null +++ b/test/CodeGen/ARM64/arith.ll @@ -0,0 +1,262 @@ +; RUN: llc < %s -march=arm64 -asm-verbose=false | FileCheck %s + +define i32 @t1(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t1: +; CHECK: add w0, w1, w0 +; CHECK: ret + %add = add i32 %b, %a + ret i32 %add +} + +define i32 @t2(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t2: +; CHECK: udiv w0, w0, w1 +; CHECK: ret + %udiv = udiv i32 %a, %b + ret i32 %udiv +} + +define i64 @t3(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t3: +; CHECK: udiv x0, x0, x1 +; CHECK: ret + %udiv = udiv i64 %a, %b + ret i64 %udiv +} + +define i32 @t4(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t4: +; CHECK: sdiv w0, w0, w1 +; CHECK: ret + %sdiv = sdiv i32 %a, %b + ret i32 %sdiv +} + +define i64 @t5(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t5: +; CHECK: sdiv x0, x0, x1 +; CHECK: ret + %sdiv = sdiv i64 %a, %b + ret i64 %sdiv +} + +define i32 @t6(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t6: +; CHECK: lslv w0, w0, w1 +; CHECK: ret + %shl = shl i32 %a, %b + ret i32 %shl +} + +define i64 @t7(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t7: +; CHECK: lslv x0, x0, x1 +; CHECK: ret + %shl = shl i64 %a, %b + ret i64 %shl +} + +define i32 @t8(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t8: +; CHECK: lsrv w0, w0, w1 +; CHECK: ret + %lshr = lshr i32 %a, %b + ret i32 %lshr +} + +define i64 @t9(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t9: +; CHECK: lsrv x0, x0, x1 +; CHECK: ret + %lshr = lshr i64 %a, %b + ret i64 %lshr +} + +define i32 @t10(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t10: +; CHECK: asrv w0, w0, w1 +; CHECK: ret + %ashr = ashr i32 %a, %b + ret i32 %ashr +} + +define i64 @t11(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t11: +; CHECK: asrv x0, x0, x1 +; CHECK: ret + %ashr = ashr i64 %a, %b + ret i64 %ashr +} + +define i32 @t12(i16 %a, i32 %x) nounwind ssp { +entry: +; CHECK-LABEL: t12: +; CHECK: add w0, w1, w0, sxth +; CHECK: ret + %c = sext i16 %a to i32 + %e = add i32 %x, %c + ret i32 %e +} + +define i32 @t13(i16 %a, i32 %x) nounwind ssp { +entry: +; CHECK-LABEL: t13: +; CHECK: add w0, w1, w0, sxth #2 +; CHECK: ret + %c = sext i16 %a to i32 + %d = shl i32 %c, 2 + %e = add i32 %x, %d + ret i32 %e +} + +define i64 @t14(i16 %a, i64 %x) nounwind ssp { +entry: +; CHECK-LABEL: t14: +; CHECK: add x0, x1, w0, uxth #3 +; CHECK: ret + %c = zext i16 %a to i64 + %d = shl i64 %c, 3 + %e = add i64 %x, %d + ret i64 %e +} + +; rdar://9160598 +define i64 @t15(i64 %a, i64 %x) nounwind ssp { +entry: +; CHECK-LABEL: t15: +; CHECK: add x0, x1, w0, uxtw +; CHECK: ret + %b = and i64 %a, 4294967295 + %c = add i64 %x, %b + ret i64 %c +} + +define i64 @t16(i64 %x) nounwind ssp { +entry: +; CHECK-LABEL: t16: +; CHECK: lsl x0, x0, #1 +; CHECK: ret + %a = shl i64 %x, 1 + ret i64 %a +} + +; rdar://9166974 +define i64 @t17(i16 %a, i64 %x) nounwind ssp { +entry: +; CHECK-LABEL: t17: +; CHECK: sxth [[REG:x[0-9]+]], x0 +; CHECK: sub 
x0, xzr, [[REG]], lsl #32 +; CHECK: ret + %tmp16 = sext i16 %a to i64 + %tmp17 = mul i64 %tmp16, -4294967296 + ret i64 %tmp17 +} + +define i32 @t18(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t18: +; CHECK: sdiv w0, w0, w1 +; CHECK: ret + %sdiv = call i32 @llvm.arm64.sdiv.i32(i32 %a, i32 %b) + ret i32 %sdiv +} + +define i64 @t19(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t19: +; CHECK: sdiv x0, x0, x1 +; CHECK: ret + %sdiv = call i64 @llvm.arm64.sdiv.i64(i64 %a, i64 %b) + ret i64 %sdiv +} + +define i32 @t20(i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t20: +; CHECK: udiv w0, w0, w1 +; CHECK: ret + %udiv = call i32 @llvm.arm64.udiv.i32(i32 %a, i32 %b) + ret i32 %udiv +} + +define i64 @t21(i64 %a, i64 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: t21: +; CHECK: udiv x0, x0, x1 +; CHECK: ret + %udiv = call i64 @llvm.arm64.udiv.i64(i64 %a, i64 %b) + ret i64 %udiv +} + +declare i32 @llvm.arm64.sdiv.i32(i32, i32) nounwind readnone +declare i64 @llvm.arm64.sdiv.i64(i64, i64) nounwind readnone +declare i32 @llvm.arm64.udiv.i32(i32, i32) nounwind readnone +declare i64 @llvm.arm64.udiv.i64(i64, i64) nounwind readnone + +; 32-bit not. +define i32 @inv_32(i32 %x) nounwind ssp { +entry: +; CHECK: inv_32 +; CHECK: mvn w0, w0 +; CHECK: ret + %inv = xor i32 %x, -1 + ret i32 %inv +} + +; 64-bit not. +define i64 @inv_64(i64 %x) nounwind ssp { +entry: +; CHECK: inv_64 +; CHECK: mvn x0, x0 +; CHECK: ret + %inv = xor i64 %x, -1 + ret i64 %inv +} + +; Multiplying by a power of two plus or minus one is better done via shift +; and add/sub rather than the madd/msub instructions. The latter are 4+ cycles, +; and the former are two (total for the two instruction sequence for subtract). +define i32 @f0(i32 %a) nounwind readnone ssp { +; CHECK-LABEL: f0: +; CHECK-NEXT: add w0, w0, w0, lsl #3 +; CHECK-NEXT: ret + %res = mul i32 %a, 9 + ret i32 %res +} + +define i64 @f1(i64 %a) nounwind readnone ssp { +; CHECK-LABEL: f1: +; CHECK-NEXT: lsl x8, x0, #4 +; CHECK-NEXT: sub x0, x8, x0 +; CHECK-NEXT: ret + %res = mul i64 %a, 15 + ret i64 %res +} + +define i32 @f2(i32 %a) nounwind readnone ssp { +; CHECK-LABEL: f2: +; CHECK-NEXT: lsl w8, w0, #3 +; CHECK-NEXT: sub w0, w8, w0 +; CHECK-NEXT: ret + %res = mul nsw i32 %a, 7 + ret i32 %res +} + +define i64 @f3(i64 %a) nounwind readnone ssp { +; CHECK-LABEL: f3: +; CHECK-NEXT: add x0, x0, x0, lsl #4 +; CHECK-NEXT: ret + %res = mul nsw i64 %a, 17 + ret i64 %res +} diff --git a/test/CodeGen/ARM64/atomic-128.ll b/test/CodeGen/ARM64/atomic-128.ll new file mode 100644 index 0000000000..a0039a3237 --- /dev/null +++ b/test/CodeGen/ARM64/atomic-128.ll @@ -0,0 +1,213 @@ +; RUN: llc < %s -march=arm64 -mtriple=arm64-linux-gnu -verify-machineinstrs | FileCheck %s + +@var = global i128 0 + +define i128 @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) { +; CHECK-LABEL: val_compare_and_swap: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[RESULTLO:x[0-9]+]], [[RESULTHI:x[0-9]+]], [x0] +; CHECK: cmp [[RESULTLO]], x2 +; CHECK: sbc xzr, [[RESULTHI]], x3 +; CHECK: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]] +; CHECK: stxp [[SCRATCH_RES:w[0-9]+]], x4, x5, [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] +; CHECK: [[LABEL2]]: + %val = cmpxchg i128* %p, i128 %oldval, i128 %newval acquire acquire + ret i128 %val +} + +define void @fetch_and_nand(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_nand: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK: bic 
[[SCRATCH_REGLO:x[0-9]+]], x2, [[DEST_REGLO]] +; CHECK: bic [[SCRATCH_REGHI:x[0-9]+]], x3, [[DEST_REGHI]] +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK: str [[DEST_REGHI]] +; CHECK: str [[DEST_REGLO]] + %val = atomicrmw nand i128* %p, i128 %bits release + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_or(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_or: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK: orr [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2 +; CHECK: orr [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3 +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK: str [[DEST_REGHI]] +; CHECK: str [[DEST_REGLO]] + %val = atomicrmw or i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_add(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_add: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK: adds [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2 +; CHECK: adc [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3 +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK: str [[DEST_REGHI]] +; CHECK: str [[DEST_REGLO]] + %val = atomicrmw add i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_sub(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_sub: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK: subs [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2 +; CHECK: sbc [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3 +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK: str [[DEST_REGHI]] +; CHECK: str [[DEST_REGLO]] + %val = atomicrmw sub i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_min(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_min: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK: cmp [[DEST_REGLO]], x2 +; CHECK: sbc xzr, [[DEST_REGHI]], x3 +; CHECK: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, lt +; CHECK: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, lt +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK: str [[DEST_REGHI]] +; CHECK: str [[DEST_REGLO]] + %val = atomicrmw min i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_max(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_max: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK: cmp [[DEST_REGLO]], x2 +; CHECK: sbc xzr, [[DEST_REGHI]], x3 +; CHECK: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, gt +; CHECK: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, gt +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK: str [[DEST_REGHI]] +; CHECK: str [[DEST_REGLO]] + %val = atomicrmw max i128* %p, i128 %bits seq_cst + store i128 %val, 
i128* @var, align 16 + ret void +} + +define void @fetch_and_umin(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_umin: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK: cmp [[DEST_REGLO]], x2 +; CHECK: sbc xzr, [[DEST_REGHI]], x3 +; CHECK: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, cc +; CHECK: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, cc +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK: str [[DEST_REGHI]] +; CHECK: str [[DEST_REGLO]] + %val = atomicrmw umin i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define void @fetch_and_umax(i128* %p, i128 %bits) { +; CHECK-LABEL: fetch_and_umax: +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp [[DEST_REGLO:x[0-9]+]], [[DEST_REGHI:x[0-9]+]], [x0] +; CHECK: cmp [[DEST_REGLO]], x2 +; CHECK: sbc xzr, [[DEST_REGHI]], x3 +; CHECK: csel [[SCRATCH_REGLO:x[0-9]+]], [[DEST_REGLO]], x2, hi +; CHECK: csel [[SCRATCH_REGHI:x[0-9]+]], [[DEST_REGHI]], x3, hi +; CHECK: stlxp [[SCRATCH_RES:w[0-9]+]], [[SCRATCH_REGLO]], [[SCRATCH_REGHI]], [x0] +; CHECK: cbnz [[SCRATCH_RES]], [[LABEL]] + +; CHECK: str [[DEST_REGHI]] +; CHECK: str [[DEST_REGLO]] + %val = atomicrmw umax i128* %p, i128 %bits seq_cst + store i128 %val, i128* @var, align 16 + ret void +} + +define i128 @atomic_load_seq_cst(i128* %p) { +; CHECK-LABEL: atomic_load_seq_cst: +; CHECK-NOT: dmb +; CHECK-LABEL: ldaxp +; CHECK-NOT: dmb + %r = load atomic i128* %p seq_cst, align 16 + ret i128 %r +} + +define i128 @atomic_load_relaxed(i128* %p) { +; CHECK-LABEL: atomic_load_relaxed: +; CHECK-NOT: dmb +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldxp [[LO:x[0-9]+]], [[HI:x[0-9]+]], [x0] +; CHECK: orr [[SAMELO:x[0-9]+]], [[LO]], xzr +; CHECK: orr [[SAMEHI:x[0-9]+]], [[HI]], xzr +; CHECK: stxp [[SUCCESS:w[0-9]+]], [[SAMELO]], [[SAMEHI]], [x0] +; CHECK: cbnz [[SUCCESS]], [[LABEL]] +; CHECK-NOT: dmb + %r = load atomic i128* %p monotonic, align 16 + ret i128 %r +} + + +define void @atomic_store_seq_cst(i128 %in, i128* %p) { +; CHECK-LABEL: atomic_store_seq_cst: +; CHECK-NOT: dmb +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxp xzr, xzr, [x2] +; CHECK: stlxp [[SUCCESS:w[0-9]+]], x0, x1, [x2] +; CHECK: cbnz [[SUCCESS]], [[LABEL]] +; CHECK-NOT: dmb + store atomic i128 %in, i128* %p seq_cst, align 16 + ret void +} + +define void @atomic_store_release(i128 %in, i128* %p) { +; CHECK-LABEL: atomic_store_release: +; CHECK-NOT: dmb +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldxp xzr, xzr, [x2] +; CHECK: stlxp [[SUCCESS:w[0-9]+]], x0, x1, [x2] +; CHECK: cbnz [[SUCCESS]], [[LABEL]] +; CHECK-NOT: dmb + store atomic i128 %in, i128* %p release, align 16 + ret void +} + +define void @atomic_store_relaxed(i128 %in, i128* %p) { +; CHECK-LABEL: atomic_store_relaxed: +; CHECK-NOT: dmb +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldxp xzr, xzr, [x2] +; CHECK: stxp [[SUCCESS:w[0-9]+]], x0, x1, [x2] +; CHECK: cbnz [[SUCCESS]], [[LABEL]] +; CHECK-NOT: dmb + store atomic i128 %in, i128* %p unordered, align 16 + ret void +} diff --git a/test/CodeGen/ARM64/atomic.ll b/test/CodeGen/ARM64/atomic.ll new file mode 100644 index 0000000000..cf8cf7d7d9 --- /dev/null +++ b/test/CodeGen/ARM64/atomic.ll @@ -0,0 +1,343 @@ +; RUN: llc < %s -march=arm64 -verify-machineinstrs | FileCheck %s + +define i32 @val_compare_and_swap(i32* %p) { +; CHECK-LABEL: val_compare_and_swap: +; CHECK: orr [[NEWVAL_REG:w[0-9]+]], wzr, 
#0x4 +; CHECK: orr [[OLDVAL_REG:w[0-9]+]], wzr, #0x7 +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxr [[RESULT:w[0-9]+]], [x0] +; CHECK: cmp [[RESULT]], [[OLDVAL_REG]] +; CHECK: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]] +; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], [[NEWVAL_REG]], [x0] +; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: [[LABEL2]]: + %val = cmpxchg i32* %p, i32 7, i32 4 acquire acquire + ret i32 %val +} + +define i64 @val_compare_and_swap_64(i64* %p) { +; CHECK-LABEL: val_compare_and_swap_64: +; CHECK: orr [[NEWVAL_REG:x[0-9]+]], xzr, #0x4 +; CHECK: orr [[OLDVAL_REG:x[0-9]+]], xzr, #0x7 +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldxr [[RESULT:x[0-9]+]], [x0] +; CHECK: cmp [[RESULT]], [[OLDVAL_REG]] +; CHECK: b.ne [[LABEL2:.?LBB[0-9]+_[0-9]+]] +; CHECK-NOT: stxr [[NEWVAL_REG]], [[NEWVAL_REG]] +; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], [[NEWVAL_REG]], [x0] +; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: [[LABEL2]]: + %val = cmpxchg i64* %p, i64 7, i64 4 monotonic monotonic + ret i64 %val +} + +define i32 @fetch_and_nand(i32* %p) { +; CHECK-LABEL: fetch_and_nand: +; CHECK: orr [[OLDVAL_REG:w[0-9]+]], wzr, #0x7 +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldxr w[[DEST_REG:[0-9]+]], [x0] +; CHECK: bic [[SCRATCH2_REG:w[0-9]+]], [[OLDVAL_REG]], w[[DEST_REG]] +; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]] +; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0] +; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: mov x0, x[[DEST_REG]] + %val = atomicrmw nand i32* %p, i32 7 release + ret i32 %val +} + +define i64 @fetch_and_nand_64(i64* %p) { +; CHECK-LABEL: fetch_and_nand_64: +; CHECK: orr [[OLDVAL_REG:x[0-9]+]], xzr, #0x7 +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxr [[DEST_REG:x[0-9]+]], [x0] +; CHECK: bic [[SCRATCH2_REG:x[0-9]+]], [[OLDVAL_REG]], [[DEST_REG]] +; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0] +; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: mov x0, [[DEST_REG]] + %val = atomicrmw nand i64* %p, i64 7 acq_rel + ret i64 %val +} + +define i32 @fetch_and_or(i32* %p) { +; CHECK-LABEL: fetch_and_or: +; CHECK: movz [[OLDVAL_REG:w[0-9]+]], #5 +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldaxr w[[DEST_REG:[0-9]+]], [x0] +; CHECK: orr [[SCRATCH2_REG:w[0-9]+]], w[[DEST_REG]], [[OLDVAL_REG]] +; CHECK-NOT: stlxr [[SCRATCH2_REG]], [[SCRATCH2_REG]] +; CHECK: stlxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0] +; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: mov x0, x[[DEST_REG]] + %val = atomicrmw or i32* %p, i32 5 seq_cst + ret i32 %val +} + +define i64 @fetch_and_or_64(i64* %p) { +; CHECK: fetch_and_or_64: +; CHECK: orr [[OLDVAL_REG:x[0-9]+]], xzr, #0x7 +; CHECK: [[LABEL:.?LBB[0-9]+_[0-9]+]]: +; CHECK: ldxr [[DEST_REG:x[0-9]+]], [x0] +; CHECK: orr [[SCRATCH2_REG:x[0-9]+]], [[DEST_REG]], [[OLDVAL_REG]] +; CHECK: stxr [[SCRATCH_REG:w[0-9]+]], [[SCRATCH2_REG]], [x0] +; CHECK: cbnz [[SCRATCH_REG]], [[LABEL]] +; CHECK: mov x0, [[DEST_REG]] + %val = atomicrmw or i64* %p, i64 7 monotonic + ret i64 %val +} + +define void @acquire_fence() { + fence acquire + ret void + ; CHECK-LABEL: acquire_fence: + ; CHECK: dmb ishld +} + +define void @release_fence() { + fence release + ret void + ; CHECK-LABEL: release_fence: + ; CHECK: dmb ish{{$}} +} + +define void @seq_cst_fence() { + fence seq_cst + ret void + ; CHECK-LABEL: seq_cst_fence: + ; CHECK: dmb ish{{$}} +} + +define i32 @atomic_load(i32* %p) { + %r = load atomic i32* %p seq_cst, align 4 + ret i32 %r + ; CHECK-LABEL: atomic_load: + ; CHECK: ldar +} + +define i8 
@atomic_load_relaxed_8(i8* %p, i32 %off32) { +; CHECK-LABEL: atomic_load_relaxed_8: + %ptr_unsigned = getelementptr i8* %p, i32 4095 + %val_unsigned = load atomic i8* %ptr_unsigned monotonic, align 1 +; CHECK: ldrb {{w[0-9]+}}, [x0, #4095] + + %ptr_regoff = getelementptr i8* %p, i32 %off32 + %val_regoff = load atomic i8* %ptr_regoff unordered, align 1 + %tot1 = add i8 %val_unsigned, %val_regoff + ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg. +; CHECK: ldrb {{w[0-9]+}}, [x0, x1, sxtw] + + %ptr_unscaled = getelementptr i8* %p, i32 -256 + %val_unscaled = load atomic i8* %ptr_unscaled monotonic, align 1 + %tot2 = add i8 %tot1, %val_unscaled +; CHECK: ldurb {{w[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i8* %p, i32 1191936 ; 0x123000 (i.e. ADD imm) + %val_random = load atomic i8* %ptr_random unordered, align 1 + %tot3 = add i8 %tot2, %val_random +; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936 +; CHECK: ldrb {{w[0-9]+}}, [x[[ADDR]]] + + ret i8 %tot3 +} + +define i16 @atomic_load_relaxed_16(i16* %p, i32 %off32) { +; CHECK-LABEL: atomic_load_relaxed_16: + %ptr_unsigned = getelementptr i16* %p, i32 4095 + %val_unsigned = load atomic i16* %ptr_unsigned monotonic, align 2 +; CHECK: ldrh {{w[0-9]+}}, [x0, #8190] + + %ptr_regoff = getelementptr i16* %p, i32 %off32 + %val_regoff = load atomic i16* %ptr_regoff unordered, align 2 + %tot1 = add i16 %val_unsigned, %val_regoff + ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg. +; CHECK: ldrh {{w[0-9]+}}, [x0, x1, sxtw #1] + + %ptr_unscaled = getelementptr i16* %p, i32 -128 + %val_unscaled = load atomic i16* %ptr_unscaled monotonic, align 2 + %tot2 = add i16 %tot1, %val_unscaled +; CHECK: ldurh {{w[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i16* %p, i32 595968 ; 0x123000/2 (i.e. ADD imm) + %val_random = load atomic i16* %ptr_random unordered, align 2 + %tot3 = add i16 %tot2, %val_random +; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936 +; CHECK: ldrh {{w[0-9]+}}, [x[[ADDR]]] + + ret i16 %tot3 +} + +define i32 @atomic_load_relaxed_32(i32* %p, i32 %off32) { +; CHECK-LABEL: atomic_load_relaxed_32: + %ptr_unsigned = getelementptr i32* %p, i32 4095 + %val_unsigned = load atomic i32* %ptr_unsigned monotonic, align 4 +; CHECK: ldr {{w[0-9]+}}, [x0, #16380] + + %ptr_regoff = getelementptr i32* %p, i32 %off32 + %val_regoff = load atomic i32* %ptr_regoff unordered, align 4 + %tot1 = add i32 %val_unsigned, %val_regoff + ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg. +; CHECK: ldr {{w[0-9]+}}, [x0, x1, sxtw #2] + + %ptr_unscaled = getelementptr i32* %p, i32 -64 + %val_unscaled = load atomic i32* %ptr_unscaled monotonic, align 4 + %tot2 = add i32 %tot1, %val_unscaled +; CHECK: ldur {{w[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i32* %p, i32 297984 ; 0x123000/4 (i.e. 
ADD imm) + %val_random = load atomic i32* %ptr_random unordered, align 4 + %tot3 = add i32 %tot2, %val_random +; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936 +; CHECK: ldr {{w[0-9]+}}, [x[[ADDR]]] + + ret i32 %tot3 +} + +define i64 @atomic_load_relaxed_64(i64* %p, i32 %off32) { +; CHECK-LABEL: atomic_load_relaxed_64: + %ptr_unsigned = getelementptr i64* %p, i32 4095 + %val_unsigned = load atomic i64* %ptr_unsigned monotonic, align 8 +; CHECK: ldr {{x[0-9]+}}, [x0, #32760] + + %ptr_regoff = getelementptr i64* %p, i32 %off32 + %val_regoff = load atomic i64* %ptr_regoff unordered, align 8 + %tot1 = add i64 %val_unsigned, %val_regoff + ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg. +; CHECK: ldr {{x[0-9]+}}, [x0, x1, sxtw #3] + + %ptr_unscaled = getelementptr i64* %p, i32 -32 + %val_unscaled = load atomic i64* %ptr_unscaled monotonic, align 8 + %tot2 = add i64 %tot1, %val_unscaled +; CHECK: ldur {{x[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i64* %p, i32 148992 ; 0x123000/8 (i.e. ADD imm) + %val_random = load atomic i64* %ptr_random unordered, align 8 + %tot3 = add i64 %tot2, %val_random +; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936 +; CHECK: ldr {{x[0-9]+}}, [x[[ADDR]]] + + ret i64 %tot3 +} + + +define void @atomc_store(i32* %p) { + store atomic i32 4, i32* %p seq_cst, align 4 + ret void + ; CHECK-LABEL: atomc_store: + ; CHECK: stlr +} + +define void @atomic_store_relaxed_8(i8* %p, i32 %off32, i8 %val) { +; CHECK-LABEL: atomic_store_relaxed_8: + %ptr_unsigned = getelementptr i8* %p, i32 4095 + store atomic i8 %val, i8* %ptr_unsigned monotonic, align 1 +; CHECK: strb {{w[0-9]+}}, [x0, #4095] + + %ptr_regoff = getelementptr i8* %p, i32 %off32 + store atomic i8 %val, i8* %ptr_regoff unordered, align 1 + ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg. +; CHECK: strb {{w[0-9]+}}, [x0, x1, sxtw] + + %ptr_unscaled = getelementptr i8* %p, i32 -256 + store atomic i8 %val, i8* %ptr_unscaled monotonic, align 1 +; CHECK: sturb {{w[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i8* %p, i32 1191936 ; 0x123000 (i.e. ADD imm) + store atomic i8 %val, i8* %ptr_random unordered, align 1 +; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936 +; CHECK: strb {{w[0-9]+}}, [x[[ADDR]]] + + ret void +} + +define void @atomic_store_relaxed_16(i16* %p, i32 %off32, i16 %val) { +; CHECK-LABEL: atomic_store_relaxed_16: + %ptr_unsigned = getelementptr i16* %p, i32 4095 + store atomic i16 %val, i16* %ptr_unsigned monotonic, align 2 +; CHECK: strh {{w[0-9]+}}, [x0, #8190] + + %ptr_regoff = getelementptr i16* %p, i32 %off32 + store atomic i16 %val, i16* %ptr_regoff unordered, align 2 + ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg. +; CHECK: strh {{w[0-9]+}}, [x0, x1, sxtw #1] + + %ptr_unscaled = getelementptr i16* %p, i32 -128 + store atomic i16 %val, i16* %ptr_unscaled monotonic, align 2 +; CHECK: sturh {{w[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i16* %p, i32 595968 ; 0x123000/2 (i.e. 
ADD imm) + store atomic i16 %val, i16* %ptr_random unordered, align 2 +; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936 +; CHECK: strh {{w[0-9]+}}, [x[[ADDR]]] + + ret void +} + +define void @atomic_store_relaxed_32(i32* %p, i32 %off32, i32 %val) { +; CHECK-LABEL: atomic_store_relaxed_32: + %ptr_unsigned = getelementptr i32* %p, i32 4095 + store atomic i32 %val, i32* %ptr_unsigned monotonic, align 4 +; CHECK: str {{w[0-9]+}}, [x0, #16380] + + %ptr_regoff = getelementptr i32* %p, i32 %off32 + store atomic i32 %val, i32* %ptr_regoff unordered, align 4 + ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg. +; CHECK: str {{w[0-9]+}}, [x0, x1, sxtw #2] + + %ptr_unscaled = getelementptr i32* %p, i32 -64 + store atomic i32 %val, i32* %ptr_unscaled monotonic, align 4 +; CHECK: stur {{w[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i32* %p, i32 297984 ; 0x123000/4 (i.e. ADD imm) + store atomic i32 %val, i32* %ptr_random unordered, align 4 +; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936 +; CHECK: str {{w[0-9]+}}, [x[[ADDR]]] + + ret void +} + +define void @atomic_store_relaxed_64(i64* %p, i32 %off32, i64 %val) { +; CHECK-LABEL: atomic_store_relaxed_64: + %ptr_unsigned = getelementptr i64* %p, i32 4095 + store atomic i64 %val, i64* %ptr_unsigned monotonic, align 8 +; CHECK: str {{x[0-9]+}}, [x0, #32760] + + %ptr_regoff = getelementptr i64* %p, i32 %off32 + store atomic i64 %val, i64* %ptr_regoff unordered, align 8 + ; FIXME: syntax is incorrect: "sxtw" should not be able to go with an x-reg. +; CHECK: str {{x[0-9]+}}, [x0, x1, sxtw #3] + + %ptr_unscaled = getelementptr i64* %p, i32 -32 + store atomic i64 %val, i64* %ptr_unscaled monotonic, align 8 +; CHECK: stur {{x[0-9]+}}, [x0, #-256] + + %ptr_random = getelementptr i64* %p, i32 148992 ; 0x123000/8 (i.e. ADD imm) + store atomic i64 %val, i64* %ptr_random unordered, align 8 +; CHECK: add x[[ADDR:[0-9]+]], x0, #1191936 +; CHECK: str {{x[0-9]+}}, [x[[ADDR]]] + + ret void +} + +; rdar://11531169 +; rdar://11531308 + +%"class.X::Atomic" = type { %struct.x_atomic_t } +%struct.x_atomic_t = type { i32 } + +@counter = external hidden global %"class.X::Atomic", align 4 + +define i32 @next_id() nounwind optsize ssp align 2 { +entry: + %0 = atomicrmw add i32* getelementptr inbounds (%"class.X::Atomic"* @counter, i64 0, i32 0, i32 0), i32 1 seq_cst + %add.i = add i32 %0, 1 + %tobool = icmp eq i32 %add.i, 0 + br i1 %tobool, label %if.else, label %return + +if.else: ; preds = %entry + %1 = atomicrmw add i32* getelementptr inbounds (%"class.X::Atomic"* @counter, i64 0, i32 0, i32 0), i32 1 seq_cst + %add.i2 = add i32 %1, 1 + br label %return + +return: ; preds = %if.else, %entry + %retval.0 = phi i32 [ %add.i2, %if.else ], [ %add.i, %entry ] + ret i32 %retval.0 +} diff --git a/test/CodeGen/ARM64/big-imm-offsets.ll b/test/CodeGen/ARM64/big-imm-offsets.ll new file mode 100644 index 0000000000..a56df07a49 --- /dev/null +++ b/test/CodeGen/ARM64/big-imm-offsets.ll @@ -0,0 +1,14 @@ +; RUN: llc -march=arm64 < %s + + +; Make sure large offsets aren't mistaken for valid immediate offsets. 
+; +define void @f(i32* nocapture %p) { +entry: + %a = ptrtoint i32* %p to i64 + %ao = add i64 %a, 25769803792 + %b = inttoptr i64 %ao to i32* + store volatile i32 0, i32* %b, align 4 + store volatile i32 0, i32* %b, align 4 + ret void +} diff --git a/test/CodeGen/ARM64/big-stack.ll b/test/CodeGen/ARM64/big-stack.ll new file mode 100644 index 0000000000..56ca30c17b --- /dev/null +++ b/test/CodeGen/ARM64/big-stack.ll @@ -0,0 +1,21 @@ +; RUN: llc < %s | FileCheck %s +target triple = "arm64-apple-macosx10" + +; Check that big stacks are generated correctly. +; Currently, this is done by a sequence of sub instructions, +; which can encode immediate with a 12 bits mask an optionally +; shift left (up to 12). I.e., 16773120 is the biggest value. +; +; CHECK-LABEL: foo: +; CHECK: sub sp, sp, #16773120 +; CHECK: sub sp, sp, #16773120 +; CHECK: sub sp, sp, #8192 +define void @foo() nounwind ssp { +entry: + %buffer = alloca [33554432 x i8], align 1 + %arraydecay = getelementptr inbounds [33554432 x i8]* %buffer, i64 0, i64 0 + call void @doit(i8* %arraydecay) nounwind + ret void +} + +declare void @doit(i8*) diff --git a/test/CodeGen/ARM64/bitfield-extract.ll b/test/CodeGen/ARM64/bitfield-extract.ll new file mode 100644 index 0000000000..96b6967a97 --- /dev/null +++ b/test/CodeGen/ARM64/bitfield-extract.ll @@ -0,0 +1,406 @@ +; RUN: llc < %s -march=arm64 | FileCheck %s +%struct.X = type { i8, i8, [2 x i8] } +%struct.Y = type { i32, i8 } +%struct.Z = type { i8, i8, [2 x i8], i16 } +%struct.A = type { i64, i8 } + +define void @foo(%struct.X* nocapture %x, %struct.Y* nocapture %y) nounwind optsize ssp { +; CHECK-LABEL: foo: +; CHECK: ubfm +; CHECK-NOT: and +; CHECK: ret + + %tmp = bitcast %struct.X* %x to i32* + %tmp1 = load i32* %tmp, align 4 + %b = getelementptr inbounds %struct.Y* %y, i64 0, i32 1 + %bf.clear = lshr i32 %tmp1, 3 + %bf.clear.lobit = and i32 %bf.clear, 1 + %frombool = trunc i32 %bf.clear.lobit to i8 + store i8 %frombool, i8* %b, align 1 + ret void +} + +define i32 @baz(i64 %cav1.coerce) nounwind { +; CHECK-LABEL: baz: +; CHECK: sbfm w0, w0, #0, #3 + %tmp = trunc i64 %cav1.coerce to i32 + %tmp1 = shl i32 %tmp, 28 + %bf.val.sext = ashr exact i32 %tmp1, 28 + ret i32 %bf.val.sext +} + +define i32 @bar(i64 %cav1.coerce) nounwind { +; CHECK-LABEL: bar: +; CHECK: sbfm w0, w0, #4, #9 + %tmp = trunc i64 %cav1.coerce to i32 + %cav1.sroa.0.1.insert = shl i32 %tmp, 22 + %tmp1 = ashr i32 %cav1.sroa.0.1.insert, 26 + ret i32 %tmp1 +} + +define void @fct1(%struct.Z* nocapture %x, %struct.A* nocapture %y) nounwind optsize ssp { +; CHECK-LABEL: fct1: +; CHECK: ubfm +; CHECK-NOT: and +; CHECK: ret + + %tmp = bitcast %struct.Z* %x to i64* + %tmp1 = load i64* %tmp, align 4 + %b = getelementptr inbounds %struct.A* %y, i64 0, i32 0 + %bf.clear = lshr i64 %tmp1, 3 + %bf.clear.lobit = and i64 %bf.clear, 1 + store i64 %bf.clear.lobit, i64* %b, align 8 + ret void +} + +define i64 @fct2(i64 %cav1.coerce) nounwind { +; CHECK-LABEL: fct2: +; CHECK: sbfm x0, x0, #0, #35 + %tmp = shl i64 %cav1.coerce, 28 + %bf.val.sext = ashr exact i64 %tmp, 28 + ret i64 %bf.val.sext +} + +define i64 @fct3(i64 %cav1.coerce) nounwind { +; CHECK-LABEL: fct3: +; CHECK: sbfm x0, x0, #4, #41 + %cav1.sroa.0.1.insert = shl i64 %cav1.coerce, 22 + %tmp1 = ashr i64 %cav1.sroa.0.1.insert, 26 + ret i64 %tmp1 +} + +define void @fct4(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct4: +; CHECK: ldr [[REG1:x[0-9]+]], +; CHECK-NEXT: bfm [[REG1]], x1, #16, #39 +; CHECK-NEXT: str [[REG1]], +; CHECK-NEXT: ret + %0 
= load i64* %y, align 8 + %and = and i64 %0, -16777216 + %shr = lshr i64 %x, 16 + %and1 = and i64 %shr, 16777215 + %or = or i64 %and, %and1 + store i64 %or, i64* %y, align 8 + ret void +} + +define void @fct5(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct5: +; CHECK: ldr [[REG1:w[0-9]+]], +; CHECK-NEXT: bfm [[REG1]], w1, #16, #18 +; CHECK-NEXT: str [[REG1]], +; CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, -8 + %shr = lshr i32 %x, 16 + %and1 = and i32 %shr, 7 + %or = or i32 %and, %and1 + store i32 %or, i32* %y, align 8 + ret void +} + +; Check if we can still catch bfm instruction when we drop some low bits +define void @fct6(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct6: +; CHECK: ldr [[REG1:w[0-9]+]], +; CHECK-NEXT: bfm [[REG1]], w1, #16, #18 +; lsr is an alias of ubfm +; CHECK-NEXT: lsr [[REG2:w[0-9]+]], [[REG1]], #2 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, -8 + %shr = lshr i32 %x, 16 + %and1 = and i32 %shr, 7 + %or = or i32 %and, %and1 + %shr1 = lshr i32 %or, 2 + store i32 %shr1, i32* %y, align 8 + ret void +} + + +; Check if we can still catch bfm instruction when we drop some high bits +define void @fct7(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct7: +; CHECK: ldr [[REG1:w[0-9]+]], +; CHECK-NEXT: bfm [[REG1]], w1, #16, #18 +; lsl is an alias of ubfm +; CHECK-NEXT: lsl [[REG2:w[0-9]+]], [[REG1]], #2 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, -8 + %shr = lshr i32 %x, 16 + %and1 = and i32 %shr, 7 + %or = or i32 %and, %and1 + %shl = shl i32 %or, 2 + store i32 %shl, i32* %y, align 8 + ret void +} + + +; Check if we can still catch bfm instruction when we drop some low bits +; (i64 version) +define void @fct8(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct8: +; CHECK: ldr [[REG1:x[0-9]+]], +; CHECK-NEXT: bfm [[REG1]], x1, #16, #18 +; lsr is an alias of ubfm +; CHECK-NEXT: lsr [[REG2:x[0-9]+]], [[REG1]], #2 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i64* %y, align 8 + %and = and i64 %0, -8 + %shr = lshr i64 %x, 16 + %and1 = and i64 %shr, 7 + %or = or i64 %and, %and1 + %shr1 = lshr i64 %or, 2 + store i64 %shr1, i64* %y, align 8 + ret void +} + + +; Check if we can still catch bfm instruction when we drop some high bits +; (i64 version) +define void @fct9(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct9: +; CHECK: ldr [[REG1:x[0-9]+]], +; CHECK-NEXT: bfm [[REG1]], x1, #16, #18 +; lsr is an alias of ubfm +; CHECK-NEXT: lsl [[REG2:x[0-9]+]], [[REG1]], #2 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i64* %y, align 8 + %and = and i64 %0, -8 + %shr = lshr i64 %x, 16 + %and1 = and i64 %shr, 7 + %or = or i64 %and, %and1 + %shl = shl i64 %or, 2 + store i64 %shl, i64* %y, align 8 + ret void +} + +; Check if we can catch bfm instruction when lsb is 0 (i.e., no lshr) +; (i32 version) +define void @fct10(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct10: +; CHECK: ldr [[REG1:w[0-9]+]], +; CHECK-NEXT: bfm [[REG1]], w1, #0, #2 +; lsl is an alias of ubfm +; CHECK-NEXT: lsl [[REG2:w[0-9]+]], [[REG1]], #2 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, -8 + %and1 = and i32 %x, 7 + %or = or i32 %and, %and1 + %shl = shl i32 %or, 2 + store i32 %shl, i32* 
%y, align 8 + ret void +} + +; Check if we can catch bfm instruction when lsb is 0 (i.e., no lshr) +; (i64 version) +define void @fct11(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct11: +; CHECK: ldr [[REG1:x[0-9]+]], +; CHECK-NEXT: bfm [[REG1]], x1, #0, #2 +; lsl is an alias of ubfm +; CHECK-NEXT: lsl [[REG2:x[0-9]+]], [[REG1]], #2 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i64* %y, align 8 + %and = and i64 %0, -8 + %and1 = and i64 %x, 7 + %or = or i64 %and, %and1 + %shl = shl i64 %or, 2 + store i64 %shl, i64* %y, align 8 + ret void +} + +define zeroext i1 @fct12bis(i32 %tmp2) unnamed_addr nounwind ssp align 2 { +; CHECK-LABEL: fct12bis: +; CHECK-NOT: and +; CHECK: ubfm w0, w0, #11, #11 + %and.i.i = and i32 %tmp2, 2048 + %tobool.i.i = icmp ne i32 %and.i.i, 0 + ret i1 %tobool.i.i +} + +; Check if we can still catch bfm instruction when we drop some high bits +; and some low bits +define void @fct12(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct12: +; CHECK: ldr [[REG1:w[0-9]+]], +; CHECK-NEXT: bfm [[REG1]], w1, #16, #18 +; lsr is an alias of ubfm +; CHECK-NEXT: ubfm [[REG2:w[0-9]+]], [[REG1]], #2, #29 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, -8 + %shr = lshr i32 %x, 16 + %and1 = and i32 %shr, 7 + %or = or i32 %and, %and1 + %shl = shl i32 %or, 2 + %shr2 = lshr i32 %shl, 4 + store i32 %shr2, i32* %y, align 8 + ret void +} + +; Check if we can still catch bfm instruction when we drop some high bits +; and some low bits +; (i64 version) +define void @fct13(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct13: +; CHECK: ldr [[REG1:x[0-9]+]], +; CHECK-NEXT: bfm [[REG1]], x1, #16, #18 +; lsr is an alias of ubfm +; CHECK-NEXT: ubfm [[REG2:x[0-9]+]], [[REG1]], #2, #61 +; CHECK-NEXT: str [[REG2]], +; CHECK-NEXT: ret + %0 = load i64* %y, align 8 + %and = and i64 %0, -8 + %shr = lshr i64 %x, 16 + %and1 = and i64 %shr, 7 + %or = or i64 %and, %and1 + %shl = shl i64 %or, 2 + %shr2 = lshr i64 %shl, 4 + store i64 %shr2, i64* %y, align 8 + ret void +} + + +; Check if we can still catch bfm instruction when we drop some high bits +; and some low bits +define void @fct14(i32* nocapture %y, i32 %x, i32 %x1) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct14: +; CHECK: ldr [[REG1:w[0-9]+]], +; CHECK-NEXT: bfm [[REG1]], w1, #16, #23 +; lsr is an alias of ubfm +; CHECK-NEXT: lsr [[REG2:w[0-9]+]], [[REG1]], #4 +; CHECK-NEXT: bfm [[REG2]], w2, #5, #7 +; lsl is an alias of ubfm +; CHECK-NEXT: lsl [[REG3:w[0-9]+]], [[REG2]], #2 +; CHECK-NEXT: str [[REG3]], +; CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, -256 + %shr = lshr i32 %x, 16 + %and1 = and i32 %shr, 255 + %or = or i32 %and, %and1 + %shl = lshr i32 %or, 4 + %and2 = and i32 %shl, -8 + %shr1 = lshr i32 %x1, 5 + %and3 = and i32 %shr1, 7 + %or1 = or i32 %and2, %and3 + %shl1 = shl i32 %or1, 2 + store i32 %shl1, i32* %y, align 8 + ret void +} + +; Check if we can still catch bfm instruction when we drop some high bits +; and some low bits +; (i64 version) +define void @fct15(i64* nocapture %y, i64 %x, i64 %x1) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct15: +; CHECK: ldr [[REG1:x[0-9]+]], +; CHECK-NEXT: bfm [[REG1]], x1, #16, #23 +; lsr is an alias of ubfm +; CHECK-NEXT: lsr [[REG2:x[0-9]+]], [[REG1]], #4 +; CHECK-NEXT: bfm [[REG2]], x2, #5, #7 +; lsl is an alias of ubfm +; CHECK-NEXT: lsl [[REG3:x[0-9]+]], [[REG2]], #2 +; 
CHECK-NEXT: str [[REG3]], +; CHECK-NEXT: ret + %0 = load i64* %y, align 8 + %and = and i64 %0, -256 + %shr = lshr i64 %x, 16 + %and1 = and i64 %shr, 255 + %or = or i64 %and, %and1 + %shl = lshr i64 %or, 4 + %and2 = and i64 %shl, -8 + %shr1 = lshr i64 %x1, 5 + %and3 = and i64 %shr1, 7 + %or1 = or i64 %and2, %and3 + %shl1 = shl i64 %or1, 2 + store i64 %shl1, i64* %y, align 8 + ret void +} + +; Check if we can still catch bfm instruction when we drop some high bits +; and some low bits and a masking operation has to be kept +define void @fct16(i32* nocapture %y, i32 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct16: +; CHECK: ldr [[REG1:w[0-9]+]], +; Create the constant +; CHECK: movz [[REGCST:w[0-9]+]], #26, lsl #16 +; CHECK: movk [[REGCST]], #33120 +; Do the masking +; CHECK: and [[REG2:w[0-9]+]], [[REG1]], [[REGCST]] +; CHECK-NEXT: bfm [[REG2]], w1, #16, #18 +; lsr is an alias of ubfm +; CHECK-NEXT: ubfm [[REG3:w[0-9]+]], [[REG2]], #2, #29 +; CHECK-NEXT: str [[REG3]], +; CHECK-NEXT: ret + %0 = load i32* %y, align 8 + %and = and i32 %0, 1737056 + %shr = lshr i32 %x, 16 + %and1 = and i32 %shr, 7 + %or = or i32 %and, %and1 + %shl = shl i32 %or, 2 + %shr2 = lshr i32 %shl, 4 + store i32 %shr2, i32* %y, align 8 + ret void +} + + +; Check if we can still catch bfm instruction when we drop some high bits +; and some low bits and a masking operation has to be kept +; (i64 version) +define void @fct17(i64* nocapture %y, i64 %x) nounwind optsize inlinehint ssp { +entry: +; CHECK-LABEL: fct17: +; CHECK: ldr [[REG1:x[0-9]+]], +; Create the constant +; CHECK: movz [[REGCST:x[0-9]+]], #26, lsl #16 +; CHECK: movk [[REGCST]], #33120 +; Do the masking +; CHECK: and [[REG2:x[0-9]+]], [[REG1]], [[REGCST]] +; CHECK-NEXT: bfm [[REG2]], x1, #16, #18 +; lsr is an alias of ubfm +; CHECK-NEXT: ubfm [[REG3:x[0-9]+]], [[REG2]], #2, #61 +; CHECK-NEXT: str [[REG3]], +; CHECK-NEXT: ret + %0 = load i64* %y, align 8 + %and = and i64 %0, 1737056 + %shr = lshr i64 %x, 16 + %and1 = and i64 %shr, 7 + %or = or i64 %and, %and1 + %shl = shl i64 %or, 2 + %shr2 = lshr i64 %shl, 4 + store i64 %shr2, i64* %y, align 8 + ret void +} + +define i64 @fct18(i32 %xor72) nounwind ssp { +; CHECK-LABEL: fct18: +; CHECK: ubfm x0, x0, #9, #16 + %shr81 = lshr i32 %xor72, 9 + %conv82 = zext i32 %shr81 to i64 + %result = and i64 %conv82, 255 + ret i64 %result +} diff --git a/test/CodeGen/ARM64/blockaddress.ll b/test/CodeGen/ARM64/blockaddress.ll new file mode 100644 index 0000000000..ac4f19e65d --- /dev/null +++ b/test/CodeGen/ARM64/blockaddress.ll @@ -0,0 +1,30 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s +; RUN: llc < %s -mtriple=arm64-linux-gnu | FileCheck %s --check-prefix=CHECK-LINUX +; RUN: llc < %s -mtriple=arm64-linux-gnu -code-model=large| FileCheck %s --check-prefix=CHECK-LARGE + +; rdar://9188695 + +define i64 @t() nounwind ssp { +entry: +; CHECK-LABEL: t: +; CHECK: adrp [[REG:x[0-9]+]], Ltmp1@PAGE +; CHECK: add {{x[0-9]+}}, [[REG]], Ltmp1@PAGEOFF + +; CHECK-LINUX-LABEL: t: +; CHECK-LINUX: adrp [[REG:x[0-9]+]], .Ltmp1 +; CHECK-LINUX: add {{x[0-9]+}}, [[REG]], :lo12:.Ltmp1 + +; CHECK-LARGE-LABEL: t: +; CHECK-LARGE: movz [[ADDR_REG:x[0-9]+]], #:abs_g3:[[DEST_LBL:.Ltmp[0-9]+]] +; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g2_nc:[[DEST_LBL]] +; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g1_nc:[[DEST_LBL]] +; CHECK-LARGE: movk [[ADDR_REG]], #:abs_g0_nc:[[DEST_LBL]] + + %recover = alloca i64, align 8 + store volatile i64 ptrtoint (i8* blockaddress(@t, %mylabel) to i64), i64* %recover, align 8 + br label %mylabel + 
+mylabel: + %tmp = load volatile i64* %recover, align 8 + ret i64 %tmp +} diff --git a/test/CodeGen/ARM64/build-vector.ll b/test/CodeGen/ARM64/build-vector.ll new file mode 100644 index 0000000000..1d137ae6e6 --- /dev/null +++ b/test/CodeGen/ARM64/build-vector.ll @@ -0,0 +1,35 @@ +; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s + +; Check that building up a vector w/ only one non-zero lane initializes +; intelligently. +define void @one_lane(i32* nocapture %out_int, i32 %skip0) nounwind { +; CHECK-LABEL: one_lane: +; CHECK: dup.16b v[[REG:[0-9]+]], wzr +; CHECK-NEXT: ins.b v[[REG]][0], w1 +; v and q are aliases, and str is prefered against st.16b when possible +; rdar://11246289 +; CHECK: str q[[REG]], [x0] +; CHECK: ret + %conv = trunc i32 %skip0 to i8 + %vset_lane = insertelement <16 x i8> , i8 %conv, i32 0 + %tmp = bitcast i32* %out_int to <4 x i32>* + %tmp1 = bitcast <16 x i8> %vset_lane to <4 x i32> + store <4 x i32> %tmp1, <4 x i32>* %tmp, align 16 + ret void +} + +; Check that building a vector from floats doesn't insert an unnecessary +; copy for lane zero. +define <4 x float> @foo(float %a, float %b, float %c, float %d) nounwind { +; CHECK-LABEL: foo: +; CHECK-NOT: ins.s v0[0], v0[0] +; CHECK: ins.s v0[1], v1[0] +; CHECK: ins.s v0[2], v2[0] +; CHECK: ins.s v0[3], v3[0] +; CHECK: ret + %1 = insertelement <4 x float> undef, float %a, i32 0 + %2 = insertelement <4 x float> %1, float %b, i32 1 + %3 = insertelement <4 x float> %2, float %c, i32 2 + %4 = insertelement <4 x float> %3, float %d, i32 3 + ret <4 x float> %4 +} diff --git a/test/CodeGen/ARM64/call-tailcalls.ll b/test/CodeGen/ARM64/call-tailcalls.ll new file mode 100644 index 0000000000..487c1d9bec --- /dev/null +++ b/test/CodeGen/ARM64/call-tailcalls.ll @@ -0,0 +1,91 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios7.0 | FileCheck %s + +@t = weak global i32 ()* null +@x = external global i32, align 4 + +define void @t2() { +; CHECK-LABEL: t2: +; CHECK: adrp x[[GOTADDR:[0-9]+]], _t@GOTPAGE +; CHECK: ldr x[[ADDR:[0-9]+]], [x[[GOTADDR]], _t@GOTPAGEOFF] +; CHECK: ldr x[[DEST:[0-9]+]], [x[[ADDR]]] +; CHECK: br x[[DEST]] + %tmp = load i32 ()** @t + %tmp.upgrd.2 = tail call i32 %tmp() + ret void +} + +define void @t3() { +; CHECK-LABEL: t3: +; CHECK: b _t2 + tail call void @t2() + ret void +} + +define double @t4(double %a) nounwind readonly ssp { +; CHECK-LABEL: t4: +; CHECK: b _sin + %tmp = tail call double @sin(double %a) nounwind readonly + ret double %tmp +} + +define float @t5(float %a) nounwind readonly ssp { +; CHECK-LABEL: t5: +; CHECK: b _sinf + %tmp = tail call float @sinf(float %a) nounwind readonly + ret float %tmp +} + +define void @t7() nounwind { +; CHECK-LABEL: t7: +; CHECK: b _foo +; CHECK: b _bar + + br i1 undef, label %bb, label %bb1.lr.ph + +bb1.lr.ph: ; preds = %entry + tail call void @bar() nounwind + ret void + +bb: ; preds = %entry + tail call void @foo() nounwind + ret void +} + +define i32 @t8(i32 %x) nounwind ssp { +; CHECK-LABEL: t8: +; CHECK: b _a +; CHECK: b _b +; CHECK: b _c + %and = and i32 %x, 1 + %tobool = icmp eq i32 %and, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + %call = tail call i32 @a(i32 %x) nounwind + br label %return + +if.end: ; preds = %entry + %and1 = and i32 %x, 2 + %tobool2 = icmp eq i32 %and1, 0 + br i1 %tobool2, label %if.end5, label %if.then3 + +if.then3: ; preds = %if.end + %call4 = tail call i32 @b(i32 %x) nounwind + br label %return + +if.end5: ; preds = %if.end + %call6 = tail call i32 @c(i32 %x) nounwind + br label %return 
+ +return: ; preds = %if.end5, %if.then3, %if.then + %retval.0 = phi i32 [ %call, %if.then ], [ %call4, %if.then3 ], [ %call6, %if.end5 ] + ret i32 %retval.0 +} + +declare float @sinf(float) nounwind readonly +declare double @sin(double) nounwind readonly +declare void @bar() nounwind +declare void @foo() nounwind +declare i32 @a(i32) +declare i32 @b(i32) +declare i32 @c(i32) diff --git a/test/CodeGen/ARM64/cast-opt.ll b/test/CodeGen/ARM64/cast-opt.ll new file mode 100644 index 0000000000..3d7f25773a --- /dev/null +++ b/test/CodeGen/ARM64/cast-opt.ll @@ -0,0 +1,31 @@ +; RUN: llc -O3 -march=arm64 -mtriple arm64-apple-ios5.0.0 < %s | FileCheck %s +; +; Zero truncation is not necessary when the values are extended properly +; already. + +@block = common global i8* null, align 8 + +define zeroext i8 @foo(i32 %i1, i32 %i2) { +; CHECK-LABEL: foo: +; CHECK: csinc +; CHECK-NOT: and +entry: + %idxprom = sext i32 %i1 to i64 + %0 = load i8** @block, align 8 + %arrayidx = getelementptr inbounds i8* %0, i64 %idxprom + %1 = load i8* %arrayidx, align 1 + %idxprom1 = sext i32 %i2 to i64 + %arrayidx2 = getelementptr inbounds i8* %0, i64 %idxprom1 + %2 = load i8* %arrayidx2, align 1 + %cmp = icmp eq i8 %1, %2 + br i1 %cmp, label %return, label %if.then + +if.then: ; preds = %entry + %cmp7 = icmp ugt i8 %1, %2 + %conv9 = zext i1 %cmp7 to i8 + br label %return + +return: ; preds = %entry, %if.then + %retval.0 = phi i8 [ %conv9, %if.then ], [ 1, %entry ] + ret i8 %retval.0 +} diff --git a/test/CodeGen/ARM64/ccmp-heuristics.ll b/test/CodeGen/ARM64/ccmp-heuristics.ll new file mode 100644 index 0000000000..5575997e53 --- /dev/null +++ b/test/CodeGen/ARM64/ccmp-heuristics.ll @@ -0,0 +1,190 @@ +; RUN: llc < %s -mcpu=cyclone -verify-machineinstrs -arm64-ccmp | FileCheck %s +target triple = "arm64-apple-ios7.0.0" + +@channelColumns = external global i64 +@channelTracks = external global i64 +@mazeRoute = external hidden unnamed_addr global i8*, align 8 +@TOP = external global i64* +@BOT = external global i64* +@netsAssign = external global i64* + +; Function from yacr2/maze.c +; The branch at the end of %if.then is driven by %cmp5 and %cmp6. +; Isel converts the and i1 into two branches, and arm64-ccmp should not convert +; it back again. %cmp6 has much higher latency than %cmp5. 
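+; A rough sketch of the shape the heuristic looks at (names hypothetical,
+; not taken from the function below):
+;   %slow = icmp ugt i64 %loaded, 1   ; feeds off a chain of dependent loads
+;   %fast = icmp ugt i64 %index, 1    ; operand is immediately available
+;   %both = and i1 %fast, %slow
+;   br i1 %both, label %then, label %else
+; ISel lowers %both as two conditional branches; recombining them into a
+; cmp/ccmp pair would make the cheap test wait on the slow one, so the
+; heuristic keeps the branches split here.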
+; CHECK: Maze1 +; CHECK: %if.then +; CHECK: cmp x{{[0-9]+}}, #2 +; CHECK-NEXT b.cc +; CHECK: %if.then +; CHECK: cmp x{{[0-9]+}}, #2 +; CHECK-NEXT b.cc +define i32 @Maze1() nounwind ssp { +entry: + %0 = load i64* @channelColumns, align 8, !tbaa !0 + %cmp90 = icmp eq i64 %0, 0 + br i1 %cmp90, label %for.end, label %for.body + +for.body: ; preds = %for.inc, %entry + %1 = phi i64 [ %0, %entry ], [ %37, %for.inc ] + %i.092 = phi i64 [ 1, %entry ], [ %inc53, %for.inc ] + %numLeft.091 = phi i32 [ 0, %entry ], [ %numLeft.1, %for.inc ] + %2 = load i8** @mazeRoute, align 8, !tbaa !3 + %arrayidx = getelementptr inbounds i8* %2, i64 %i.092 + %3 = load i8* %arrayidx, align 1, !tbaa !1 + %tobool = icmp eq i8 %3, 0 + br i1 %tobool, label %for.inc, label %if.then + +if.then: ; preds = %for.body + %4 = load i64** @TOP, align 8, !tbaa !3 + %arrayidx1 = getelementptr inbounds i64* %4, i64 %i.092 + %5 = load i64* %arrayidx1, align 8, !tbaa !0 + %6 = load i64** @netsAssign, align 8, !tbaa !3 + %arrayidx2 = getelementptr inbounds i64* %6, i64 %5 + %7 = load i64* %arrayidx2, align 8, !tbaa !0 + %8 = load i64** @BOT, align 8, !tbaa !3 + %arrayidx3 = getelementptr inbounds i64* %8, i64 %i.092 + %9 = load i64* %arrayidx3, align 8, !tbaa !0 + %arrayidx4 = getelementptr inbounds i64* %6, i64 %9 + %10 = load i64* %arrayidx4, align 8, !tbaa !0 + %cmp5 = icmp ugt i64 %i.092, 1 + %cmp6 = icmp ugt i64 %10, 1 + %or.cond = and i1 %cmp5, %cmp6 + br i1 %or.cond, label %land.lhs.true7, label %if.else + +land.lhs.true7: ; preds = %if.then + %11 = load i64* @channelTracks, align 8, !tbaa !0 + %add = add i64 %11, 1 + %call = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 %add, i64 %10, i64 0, i64 %7, i32 -1, i32 -1) + %tobool8 = icmp eq i32 %call, 0 + br i1 %tobool8, label %land.lhs.true7.if.else_crit_edge, label %if.then9 + +land.lhs.true7.if.else_crit_edge: ; preds = %land.lhs.true7 + %.pre = load i64* @channelColumns, align 8, !tbaa !0 + br label %if.else + +if.then9: ; preds = %land.lhs.true7 + %12 = load i8** @mazeRoute, align 8, !tbaa !3 + %arrayidx10 = getelementptr inbounds i8* %12, i64 %i.092 + store i8 0, i8* %arrayidx10, align 1, !tbaa !1 + %13 = load i64** @TOP, align 8, !tbaa !3 + %arrayidx11 = getelementptr inbounds i64* %13, i64 %i.092 + %14 = load i64* %arrayidx11, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %14) + %15 = load i64** @BOT, align 8, !tbaa !3 + %arrayidx12 = getelementptr inbounds i64* %15, i64 %i.092 + %16 = load i64* %arrayidx12, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %16) + br label %for.inc + +if.else: ; preds = %land.lhs.true7.if.else_crit_edge, %if.then + %17 = phi i64 [ %.pre, %land.lhs.true7.if.else_crit_edge ], [ %1, %if.then ] + %cmp13 = icmp ult i64 %i.092, %17 + %or.cond89 = and i1 %cmp13, %cmp6 + br i1 %or.cond89, label %land.lhs.true16, label %if.else24 + +land.lhs.true16: ; preds = %if.else + %18 = load i64* @channelTracks, align 8, !tbaa !0 + %add17 = add i64 %18, 1 + %call18 = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 %add17, i64 %10, i64 0, i64 %7, i32 1, i32 -1) + %tobool19 = icmp eq i32 %call18, 0 + br i1 %tobool19, label %if.else24, label %if.then20 + +if.then20: ; preds = %land.lhs.true16 + %19 = load i8** @mazeRoute, align 8, !tbaa !3 + %arrayidx21 = getelementptr inbounds i8* %19, i64 %i.092 + store i8 0, i8* %arrayidx21, align 1, !tbaa !1 + %20 = load i64** @TOP, align 8, !tbaa !3 + %arrayidx22 = getelementptr inbounds i64* %20, i64 %i.092 + %21 = load i64* %arrayidx22, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %21) + %22 = 
load i64** @BOT, align 8, !tbaa !3 + %arrayidx23 = getelementptr inbounds i64* %22, i64 %i.092 + %23 = load i64* %arrayidx23, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %23) + br label %for.inc + +if.else24: ; preds = %land.lhs.true16, %if.else + br i1 %cmp5, label %land.lhs.true26, label %if.else36 + +land.lhs.true26: ; preds = %if.else24 + %24 = load i64* @channelTracks, align 8, !tbaa !0 + %cmp27 = icmp ult i64 %7, %24 + br i1 %cmp27, label %land.lhs.true28, label %if.else36 + +land.lhs.true28: ; preds = %land.lhs.true26 + %add29 = add i64 %24, 1 + %call30 = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 0, i64 %7, i64 %add29, i64 %10, i32 -1, i32 1) + %tobool31 = icmp eq i32 %call30, 0 + br i1 %tobool31, label %if.else36, label %if.then32 + +if.then32: ; preds = %land.lhs.true28 + %25 = load i8** @mazeRoute, align 8, !tbaa !3 + %arrayidx33 = getelementptr inbounds i8* %25, i64 %i.092 + store i8 0, i8* %arrayidx33, align 1, !tbaa !1 + %26 = load i64** @TOP, align 8, !tbaa !3 + %arrayidx34 = getelementptr inbounds i64* %26, i64 %i.092 + %27 = load i64* %arrayidx34, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %27) + %28 = load i64** @BOT, align 8, !tbaa !3 + %arrayidx35 = getelementptr inbounds i64* %28, i64 %i.092 + %29 = load i64* %arrayidx35, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %29) + br label %for.inc + +if.else36: ; preds = %land.lhs.true28, %land.lhs.true26, %if.else24 + %30 = load i64* @channelColumns, align 8, !tbaa !0 + %cmp37 = icmp ult i64 %i.092, %30 + br i1 %cmp37, label %land.lhs.true38, label %if.else48 + +land.lhs.true38: ; preds = %if.else36 + %31 = load i64* @channelTracks, align 8, !tbaa !0 + %cmp39 = icmp ult i64 %7, %31 + br i1 %cmp39, label %land.lhs.true40, label %if.else48 + +land.lhs.true40: ; preds = %land.lhs.true38 + %add41 = add i64 %31, 1 + %call42 = tail call fastcc i32 @Maze1Mech(i64 %i.092, i64 0, i64 %7, i64 %add41, i64 %10, i32 1, i32 1) + %tobool43 = icmp eq i32 %call42, 0 + br i1 %tobool43, label %if.else48, label %if.then44 + +if.then44: ; preds = %land.lhs.true40 + %32 = load i8** @mazeRoute, align 8, !tbaa !3 + %arrayidx45 = getelementptr inbounds i8* %32, i64 %i.092 + store i8 0, i8* %arrayidx45, align 1, !tbaa !1 + %33 = load i64** @TOP, align 8, !tbaa !3 + %arrayidx46 = getelementptr inbounds i64* %33, i64 %i.092 + %34 = load i64* %arrayidx46, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %34) + %35 = load i64** @BOT, align 8, !tbaa !3 + %arrayidx47 = getelementptr inbounds i64* %35, i64 %i.092 + %36 = load i64* %arrayidx47, align 8, !tbaa !0 + tail call fastcc void @CleanNet(i64 %36) + br label %for.inc + +if.else48: ; preds = %land.lhs.true40, %land.lhs.true38, %if.else36 + %inc = add nsw i32 %numLeft.091, 1 + br label %for.inc + +for.inc: ; preds = %if.else48, %if.then44, %if.then32, %if.then20, %if.then9, %for.body + %numLeft.1 = phi i32 [ %numLeft.091, %if.then9 ], [ %numLeft.091, %if.then20 ], [ %numLeft.091, %if.then32 ], [ %numLeft.091, %if.then44 ], [ %inc, %if.else48 ], [ %numLeft.091, %for.body ] + %inc53 = add i64 %i.092, 1 + %37 = load i64* @channelColumns, align 8, !tbaa !0 + %cmp = icmp ugt i64 %inc53, %37 + br i1 %cmp, label %for.end, label %for.body + +for.end: ; preds = %for.inc, %entry + %numLeft.0.lcssa = phi i32 [ 0, %entry ], [ %numLeft.1, %for.inc ] + ret i32 %numLeft.0.lcssa +} + +; Materializable +declare hidden fastcc i32 @Maze1Mech(i64, i64, i64, i64, i64, i32, i32) nounwind ssp + +; Materializable +declare hidden fastcc void @CleanNet(i64) nounwind ssp + +!0 = 
metadata !{metadata !"long", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} +!3 = metadata !{metadata !"any pointer", metadata !1} diff --git a/test/CodeGen/ARM64/ccmp.ll b/test/CodeGen/ARM64/ccmp.ll new file mode 100644 index 0000000000..79e6f94e3f --- /dev/null +++ b/test/CodeGen/ARM64/ccmp.ll @@ -0,0 +1,289 @@ +; RUN: llc < %s -mcpu=cyclone -verify-machineinstrs -arm64-ccmp -arm64-stress-ccmp | FileCheck %s +target triple = "arm64-apple-ios" + +; CHECK: single_same +; CHECK: cmp w0, #5 +; CHECK-NEXT: ccmp w1, #17, #4, ne +; CHECK-NEXT: b.ne +; CHECK: %if.then +; CHECK: bl _foo +; CHECK: %if.end +define i32 @single_same(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 5 + %cmp1 = icmp eq i32 %b, 17 + %or.cond = or i1 %cmp, %cmp1 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Different condition codes for the two compares. +; CHECK: single_different +; CHECK: cmp w0, #6 +; CHECK-NEXT: ccmp w1, #17, #0, ge +; CHECK-NEXT: b.eq +; CHECK: %if.then +; CHECK: bl _foo +; CHECK: %if.end +define i32 @single_different(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp sle i32 %a, 5 + %cmp1 = icmp ne i32 %b, 17 + %or.cond = or i1 %cmp, %cmp1 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Second block clobbers the flags, can't convert (easily). +; CHECK: single_flagclobber +; CHECK: cmp +; CHECK: b.eq +; CHECK: cmp +; CHECK: b.gt +define i32 @single_flagclobber(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 5 + br i1 %cmp, label %if.then, label %lor.lhs.false + +lor.lhs.false: ; preds = %entry + %cmp1 = icmp slt i32 %b, 7 + %mul = shl nsw i32 %b, 1 + %add = add nsw i32 %b, 1 + %cond = select i1 %cmp1, i32 %mul, i32 %add + %cmp2 = icmp slt i32 %cond, 17 + br i1 %cmp2, label %if.then, label %if.end + +if.then: ; preds = %lor.lhs.false, %entry + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: ; preds = %if.then, %lor.lhs.false + ret i32 7 +} + +; Second block clobbers the flags and ends with a tbz terminator. +; CHECK: single_flagclobber_tbz +; CHECK: cmp +; CHECK: b.eq +; CHECK: cmp +; CHECK: tbz +define i32 @single_flagclobber_tbz(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 5 + br i1 %cmp, label %if.then, label %lor.lhs.false + +lor.lhs.false: ; preds = %entry + %cmp1 = icmp slt i32 %b, 7 + %mul = shl nsw i32 %b, 1 + %add = add nsw i32 %b, 1 + %cond = select i1 %cmp1, i32 %mul, i32 %add + %and = and i32 %cond, 8 + %cmp2 = icmp ne i32 %and, 0 + br i1 %cmp2, label %if.then, label %if.end + +if.then: ; preds = %lor.lhs.false, %entry + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: ; preds = %if.then, %lor.lhs.false + ret i32 7 +} + +; Speculatively execute division by zero. +; The sdiv/udiv instructions do not trap when the divisor is zero, so they are +; safe to speculate. 
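+; In other words, hoisting the sdiv above its guard is sound on AArch64: a
+; zero divisor produces zero rather than a trap. That lets the two compares
+; collapse into a conditional-compare sequence, e.g. (a sketch; register
+; names and encoding assumed, not checked by this test):
+;   sdiv w8, w1, w0
+;   cmp  w0, #0
+;   ccmp w8, #17, #4, gt
+;   b.ge LBB_if_end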
+; CHECK: speculate_division +; CHECK-NOT: cmp +; CHECK: sdiv +; CHECK: cmp +; CHECK-NEXT: ccmp +define i32 @speculate_division(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp sgt i32 %a, 0 + br i1 %cmp, label %land.lhs.true, label %if.end + +land.lhs.true: + %div = sdiv i32 %b, %a + %cmp1 = icmp slt i32 %div, 17 + br i1 %cmp1, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Floating point compare. +; CHECK: single_fcmp +; CHECK: cmp +; CHECK-NOT: b. +; CHECK: fccmp {{.*}}, #8, ge +; CHECK: b.lt +define i32 @single_fcmp(i32 %a, float %b) nounwind ssp { +entry: + %cmp = icmp sgt i32 %a, 0 + br i1 %cmp, label %land.lhs.true, label %if.end + +land.lhs.true: + %conv = sitofp i32 %a to float + %div = fdiv float %b, %conv + %cmp1 = fcmp oge float %div, 1.700000e+01 + br i1 %cmp1, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Chain multiple compares. +; CHECK: multi_different +; CHECK: cmp +; CHECK: ccmp +; CHECK: ccmp +; CHECK: b. +define void @multi_different(i32 %a, i32 %b, i32 %c) nounwind ssp { +entry: + %cmp = icmp sgt i32 %a, %b + br i1 %cmp, label %land.lhs.true, label %if.end + +land.lhs.true: + %div = sdiv i32 %b, %a + %cmp1 = icmp eq i32 %div, 5 + %cmp4 = icmp sgt i32 %div, %c + %or.cond = and i1 %cmp1, %cmp4 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret void +} + +; Convert a cbz in the head block. +; CHECK: cbz_head +; CHECK: cmp w0, #0 +; CHECK: ccmp +define i32 @cbz_head(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 0 + %cmp1 = icmp ne i32 %b, 17 + %or.cond = or i1 %cmp, %cmp1 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Check that the immediate operand is in range. The ccmp instruction encodes a +; smaller range of immediates than subs/adds. +; The ccmp immediates must be in the range 0-31. +; CHECK: immediate_range +; CHECK-NOT: ccmp +define i32 @immediate_range(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 5 + %cmp1 = icmp eq i32 %b, 32 + %or.cond = or i1 %cmp, %cmp1 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Convert a cbz in the second block. +; CHECK: cbz_second +; CHECK: cmp w0, #0 +; CHECK: ccmp w1, #0, #0, ne +; CHECK: b.eq +define i32 @cbz_second(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 0 + %cmp1 = icmp ne i32 %b, 0 + %or.cond = or i1 %cmp, %cmp1 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} + +; Convert a cbnz in the second block. +; CHECK: cbnz_second +; CHECK: cmp w0, #0 +; CHECK: ccmp w1, #0, #4, ne +; CHECK: b.ne +define i32 @cbnz_second(i32 %a, i32 %b) nounwind ssp { +entry: + %cmp = icmp eq i32 %a, 0 + %cmp1 = icmp eq i32 %b, 0 + %or.cond = or i1 %cmp, %cmp1 + br i1 %or.cond, label %if.then, label %if.end + +if.then: + %call = tail call i32 @foo() nounwind + br label %if.end + +if.end: + ret i32 7 +} +declare i32 @foo() + +%str1 = type { %str2 } +%str2 = type { [24 x i8], i8*, i32, %str1*, i32, [4 x i8], %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, %str1*, i8*, i8, i8*, %str1*, i8* } + +; Test case distilled from 126.gcc. 
+; The phi in sw.bb.i.i gets multiple operands for the %entry predecessor. +; CHECK: build_modify_expr +define void @build_modify_expr() nounwind ssp { +entry: + switch i32 undef, label %sw.bb.i.i [ + i32 69, label %if.end85 + i32 70, label %if.end85 + i32 71, label %if.end85 + i32 72, label %if.end85 + i32 73, label %if.end85 + i32 105, label %if.end85 + i32 106, label %if.end85 + ] + +if.end85: + ret void + +sw.bb.i.i: + %ref.tr.i.i = phi %str1* [ %0, %sw.bb.i.i ], [ undef, %entry ] + %operands.i.i = getelementptr inbounds %str1* %ref.tr.i.i, i64 0, i32 0, i32 2 + %arrayidx.i.i = bitcast i32* %operands.i.i to %str1** + %0 = load %str1** %arrayidx.i.i, align 8 + %code1.i.i.phi.trans.insert = getelementptr inbounds %str1* %0, i64 0, i32 0, i32 0, i64 16 + br label %sw.bb.i.i +} diff --git a/test/CodeGen/ARM64/coalesce-ext.ll b/test/CodeGen/ARM64/coalesce-ext.ll new file mode 100644 index 0000000000..9e8d08e055 --- /dev/null +++ b/test/CodeGen/ARM64/coalesce-ext.ll @@ -0,0 +1,17 @@ +; RUN: llc -march=arm64 -mtriple=arm64-apple-darwin < %s | FileCheck %s +; Check that the peephole optimizer knows about sext and zext instructions. +; CHECK: test1sext +define i32 @test1sext(i64 %A, i64 %B, i32* %P, i64 *%P2) nounwind { + %C = add i64 %A, %B + ; CHECK: add x[[SUM:[0-9]+]], x0, x1 + %D = trunc i64 %C to i32 + %E = shl i64 %C, 32 + %F = ashr i64 %E, 32 + ; CHECK: sxtw x[[EXT:[0-9]+]], x[[SUM]] + store volatile i64 %F, i64 *%P2 + ; CHECK: str x[[EXT]] + store volatile i32 %D, i32* %P + ; Reuse low bits of extended register, don't extend live range of SUM. + ; CHECK: str w[[SUM]] + ret i32 %D +} diff --git a/test/CodeGen/ARM64/code-model-large-abs.ll b/test/CodeGen/ARM64/code-model-large-abs.ll new file mode 100644 index 0000000000..264da2da25 --- /dev/null +++ b/test/CodeGen/ARM64/code-model-large-abs.ll @@ -0,0 +1,72 @@ +; RUN: llc -mtriple=arm64-none-linux-gnu -code-model=large < %s | FileCheck %s + +@var8 = global i8 0 +@var16 = global i16 0 +@var32 = global i32 0 +@var64 = global i64 0 + +define i8* @global_addr() { +; CHECK-LABEL: global_addr: + ret i8* @var8 + ; The movz/movk calculation should end up returned directly in x0. 
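+  ; Worked example of the large-code-model sequence for an assumed address of
+  ; 0x0123456789abcdef (the test below only checks the relocation specifiers):
+  ;   movz x0, #0x0123, lsl #48   ; :abs_g3:     bits [63:48]
+  ;   movk x0, #0x4567, lsl #32   ; :abs_g2_nc:  bits [47:32]
+  ;   movk x0, #0x89ab, lsl #16   ; :abs_g1_nc:  bits [31:16]
+  ;   movk x0, #0xcdef            ; :abs_g0_nc:  bits [15:0]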
+; CHECK: movz x0, #:abs_g3:var8
+; CHECK: movk x0, #:abs_g2_nc:var8
+; CHECK: movk x0, #:abs_g1_nc:var8
+; CHECK: movk x0, #:abs_g0_nc:var8
+; CHECK-NEXT: ret
+}
+
+define i8 @global_i8() {
+; CHECK-LABEL: global_i8:
+  %val = load i8* @var8
+  ret i8 %val
+; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var8
+; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var8
+; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var8
+; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var8
+; CHECK: ldrb w0, [x[[ADDR_REG]]]
+}
+
+define i16 @global_i16() {
+; CHECK-LABEL: global_i16:
+  %val = load i16* @var16
+  ret i16 %val
+; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var16
+; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var16
+; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var16
+; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var16
+; CHECK: ldrh w0, [x[[ADDR_REG]]]
+}
+
+define i32 @global_i32() {
+; CHECK-LABEL: global_i32:
+  %val = load i32* @var32
+  ret i32 %val
+; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var32
+; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var32
+; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var32
+; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var32
+; CHECK: ldr w0, [x[[ADDR_REG]]]
+}
+
+define i64 @global_i64() {
+; CHECK-LABEL: global_i64:
+  %val = load i64* @var64
+  ret i64 %val
+; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:var64
+; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:var64
+; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:var64
+; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:var64
+; CHECK: ldr x0, [x[[ADDR_REG]]]
+}
+
+define <2 x i64> @constpool() {
+; CHECK-LABEL: constpool:
+  ret <2 x i64>
+
+; CHECK: movz x[[ADDR_REG:[0-9]+]], #:abs_g3:[[CPADDR:.LCPI[0-9]+_[0-9]+]]
+; CHECK: movk x[[ADDR_REG]], #:abs_g2_nc:[[CPADDR]]
+; CHECK: movk x[[ADDR_REG]], #:abs_g1_nc:[[CPADDR]]
+; CHECK: movk x[[ADDR_REG]], #:abs_g0_nc:[[CPADDR]]
+; CHECK: ldr q0, [x[[ADDR_REG]]]
+}
diff --git a/test/CodeGen/ARM64/collect-loh-garbage-crash.ll b/test/CodeGen/ARM64/collect-loh-garbage-crash.ll
new file mode 100644
index 0000000000..98cb625d2d
--- /dev/null
+++ b/test/CodeGen/ARM64/collect-loh-garbage-crash.ll
@@ -0,0 +1,37 @@
+; RUN: llc -mtriple=arm64-apple-ios -O3 -arm64-collect-loh -arm64-collect-loh-bb-only=true -arm64-collect-loh-pre-collect-register=false < %s -o - | FileCheck %s
+; Check that the LOH analysis does not crash when the analysed chain
+; contains instructions that are filtered out.
+;
+; Before the fix for , these cases were removed
+; from the main container. Now, the deterministic container does not allow
+; arbitrary values to be removed, so we have to live with garbage values.
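+; For reference, the kind of hint the CHECK-NOT below guards against would
+; annotate a GOT-indirect load chain roughly like this (labels assumed):
+;   Lloh0: adrp x8, _pH4ISPDevice@GOTPAGE
+;   Lloh1: ldr  x8, [x8, _pH4ISPDevice@GOTPAGEOFF]
+;   Lloh2: ldr  x8, [x8]
+;          .loh AdrpLdrGotLdr Lloh0, Lloh1, Lloh2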
+;
+
+%"class.H4ISP::H4ISPDevice" = type { i32 (%"class.H4ISP::H4ISPDevice"*, i32, i8*, i8*)*, i8*, i32*, %"class.H4ISP::H4ISPCameraManager"* }
+
+%"class.H4ISP::H4ISPCameraManager" = type opaque
+
+declare i32 @_ZN5H4ISP11H4ISPDevice32ISP_SelectBestMIPIFrequencyIndexEjPj(%"class.H4ISP::H4ISPDevice"*)
+
+@pH4ISPDevice = hidden global %"class.H4ISP::H4ISPDevice"* null, align 8
+
+; CHECK-LABEL: _foo:
+; CHECK: ret
+; CHECK-NOT: .loh AdrpLdrGotLdr
+define void @foo() {
+entry:
+  br label %if.then83
+if.then83: ; preds = %if.end81
+  %tmp = load %"class.H4ISP::H4ISPDevice"** @pH4ISPDevice, align 8
+  %call84 = call i32 @_ZN5H4ISP11H4ISPDevice32ISP_SelectBestMIPIFrequencyIndexEjPj(%"class.H4ISP::H4ISPDevice"* %tmp) #19
+  tail call void asm sideeffect "", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x27}"()
+  %tmp2 = load %"class.H4ISP::H4ISPDevice"** @pH4ISPDevice, align 8
+  tail call void asm sideeffect "", "~{x19},~{x20},~{x21},~{x22},~{x23},~{x24},~{x25},~{x26},~{x28}"()
+  %pCameraManager.i268 = getelementptr inbounds %"class.H4ISP::H4ISPDevice"* %tmp2, i64 0, i32 3
+  %tmp3 = load %"class.H4ISP::H4ISPCameraManager"** %pCameraManager.i268, align 8
+  %tobool.i269 = icmp eq %"class.H4ISP::H4ISPCameraManager"* %tmp3, null
+  br i1 %tobool.i269, label %if.then83, label %end
+end:
+  ret void
+}
+
diff --git a/test/CodeGen/ARM64/collect-loh-str.ll b/test/CodeGen/ARM64/collect-loh-str.ll
new file mode 100644
index 0000000000..fc63f8bcc2
--- /dev/null
+++ b/test/CodeGen/ARM64/collect-loh-str.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mtriple=arm64-apple-ios -O2 -arm64-collect-loh -arm64-collect-loh-bb-only=false < %s -o - | FileCheck %s
+; Test case for .
+; AdrpAddStr cannot be used when the store uses the same
+; register as address and value. Indeed, the related
+; if applied, may completely remove the definition or
+; at least provide a wrong one (with the offset folded
+; into the definition).
+
+%struct.anon = type { i32*, i32** }
+
+@pptp_wan_head = internal global %struct.anon zeroinitializer, align 8
+
+; CHECK-LABEL: _pptp_wan_init
+; CHECK: ret
+; CHECK-NOT: AdrpAddStr
+define i32 @pptp_wan_init() {
+entry:
+  store i32* null, i32** getelementptr inbounds (%struct.anon* @pptp_wan_head, i64 0, i32 0), align 8
+  store i32** getelementptr inbounds (%struct.anon* @pptp_wan_head, i64 0, i32 0), i32*** getelementptr inbounds (%struct.anon* @pptp_wan_head, i64 0, i32 1), align 8
+  ret i32 0
+}
+
+
diff --git a/test/CodeGen/ARM64/collect-loh.ll b/test/CodeGen/ARM64/collect-loh.ll
new file mode 100644
index 0000000000..08ab0620b8
--- /dev/null
+++ b/test/CodeGen/ARM64/collect-loh.ll
@@ -0,0 +1,47 @@
+; RUN: llc -mtriple=arm64-apple-ios -O2 -arm64-collect-loh -arm64-collect-loh-bb-only=false < %s -o - | FileCheck %s
+
+@a = internal unnamed_addr global i32 0, align 4
+@b = external global i32
+
+; Function Attrs: noinline nounwind ssp
+define void @foo(i32 %t) {
+entry:
+  %tmp = load i32* @a, align 4
+  %add = add nsw i32 %tmp, %t
+  store i32 %add, i32* @a, align 4
+  ret void
+}
+
+; Function Attrs: nounwind ssp
+; Testcase for , AdrpAdrp reuse is valid only when the first adrp
+; dominates the second.
+; The first adrp comes from the loading of 'a' and the second from the loading of 'b'.
+; 'a' is loaded in if.then, 'b' in if.end4, and if.then does not dominate if.end4.
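+; For reference, a valid AdrpAdrp hint pairs two page computations so the
+; linker may reuse the first one for the second, roughly (labels assumed):
+;   Lloh0: adrp x8, _a@PAGE
+;          ...
+;   Lloh1: adrp x9, _b@PAGE
+;          .loh AdrpAdrp Lloh0, Lloh1
+; Emitting it here would be wrong because not every path to the adrp for 'b'
+; (in if.end4) goes through the adrp for 'a' (in if.then).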
+; CHECK-LABEL: _test +; CHECK: ret +; CHECK-NOT: .loh AdrpAdrp +define i32 @test(i32 %t) { +entry: + %cmp = icmp sgt i32 %t, 5 + br i1 %cmp, label %if.then, label %if.end4 + +if.then: ; preds = %entry + %tmp = load i32* @a, align 4 + %add = add nsw i32 %tmp, %t + %cmp1 = icmp sgt i32 %add, 12 + br i1 %cmp1, label %if.then2, label %if.end4 + +if.then2: ; preds = %if.then + tail call void @foo(i32 %add) + %tmp1 = load i32* @a, align 4 + br label %if.end4 + +if.end4: ; preds = %if.then2, %if.then, %entry + %t.addr.0 = phi i32 [ %tmp1, %if.then2 ], [ %t, %if.then ], [ %t, %entry ] + %tmp2 = load i32* @b, align 4 + %add5 = add nsw i32 %tmp2, %t.addr.0 + tail call void @foo(i32 %add5) + %tmp3 = load i32* @b, align 4 + %add6 = add nsw i32 %tmp3, %t.addr.0 + ret i32 %add6 +} diff --git a/test/CodeGen/ARM64/compact-unwind-unhandled-cfi.S b/test/CodeGen/ARM64/compact-unwind-unhandled-cfi.S new file mode 100644 index 0000000000..250732d6e8 --- /dev/null +++ b/test/CodeGen/ARM64/compact-unwind-unhandled-cfi.S @@ -0,0 +1,17 @@ +; RUN: llvm-mc -triple arm64-apple-darwin -filetype=obj -o /dev/null %s + + .text + .globl _foo + .cfi_startproc +_foo: + stp x29, x30, [sp, #-16]! + .cfi_adjust_cfa_offset 16 + + ldp x29, x30, [sp], #16 + .cfi_adjust_cfa_offset -16 + .cfi_restore x29 + .cfi_restore x30 + + ret + + .cfi_endproc diff --git a/test/CodeGen/ARM64/complex-ret.ll b/test/CodeGen/ARM64/complex-ret.ll new file mode 100644 index 0000000000..93d50a5986 --- /dev/null +++ b/test/CodeGen/ARM64/complex-ret.ll @@ -0,0 +1,7 @@ +; RUN: llc -march=arm64 -o - %s | FileCheck %s + +define { i192, i192, i21, i192 } @foo(i192) { +; CHECK-LABEL: foo: +; CHECK: stp xzr, xzr, [x8] + ret { i192, i192, i21, i192 } {i192 0, i192 1, i21 2, i192 3} +} diff --git a/test/CodeGen/ARM64/convert-v2f64-v2i32.ll b/test/CodeGen/ARM64/convert-v2f64-v2i32.ll new file mode 100644 index 0000000000..1a07c98655 --- /dev/null +++ b/test/CodeGen/ARM64/convert-v2f64-v2i32.ll @@ -0,0 +1,24 @@ +; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s + +; CHECK: fptosi_1 +; CHECK: fcvtzs.2d +; CHECK: xtn.2s +; CHECK: ret +define void @fptosi_1() nounwind noinline ssp { +entry: + %0 = fptosi <2 x double> undef to <2 x i32> + store <2 x i32> %0, <2 x i32>* undef, align 8 + ret void +} + +; CHECK: fptoui_1 +; CHECK: fcvtzu.2d +; CHECK: xtn.2s +; CHECK: ret +define void @fptoui_1() nounwind noinline ssp { +entry: + %0 = fptoui <2 x double> undef to <2 x i32> + store <2 x i32> %0, <2 x i32>* undef, align 8 + ret void +} + diff --git a/test/CodeGen/ARM64/convert-v2i32-v2f64.ll b/test/CodeGen/ARM64/convert-v2i32-v2f64.ll new file mode 100644 index 0000000000..63129a4b83 --- /dev/null +++ b/test/CodeGen/ARM64/convert-v2i32-v2f64.ll @@ -0,0 +1,29 @@ +; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s + +define <2 x double> @f1(<2 x i32> %v) nounwind readnone { +; CHECK-LABEL: f1: +; CHECK: sshll.2d v0, v0, #0 +; CHECK-NEXT: scvtf.2d v0, v0 +; CHECK-NEXT: ret + %conv = sitofp <2 x i32> %v to <2 x double> + ret <2 x double> %conv +} +define <2 x double> @f2(<2 x i32> %v) nounwind readnone { +; CHECK-LABEL: f2: +; CHECK: ushll.2d v0, v0, #0 +; CHECK-NEXT: ucvtf.2d v0, v0 +; CHECK-NEXT: ret + %conv = uitofp <2 x i32> %v to <2 x double> + ret <2 x double> %conv +} + +; CHECK: autogen_SD19655 +; CHECK: scvtf +; CHECK: ret +define void @autogen_SD19655() { + %T = load <2 x i64>* undef + %F = sitofp <2 x i64> undef to <2 x float> + store <2 x float> %F, <2 x float>* undef + ret void +} + diff --git 
a/test/CodeGen/ARM64/copy-tuple.ll b/test/CodeGen/ARM64/copy-tuple.ll new file mode 100644 index 0000000000..6325c3f855 --- /dev/null +++ b/test/CodeGen/ARM64/copy-tuple.ll @@ -0,0 +1,146 @@ +; RUN: llc -mtriple=arm64-apple-ios -o - %s | FileCheck %s + +; The main purpose of this test is to find out whether copyPhysReg can deal with +; the memmove-like situation arising in tuples, where an early copy can clobber +; the value needed by a later one if the tuples overlap. + +; We use dummy inline asm to force LLVM to generate a COPY between the registers +; we want by clobbering all the others. + +define void @test_D1D2_from_D0D1(i8* %addr) #0 { +; CHECK-LABEL: test_D1D2_from_D0D1: +; CHECK: orr.8b v2, v1 +; CHECK: orr.8b v1, v0 +entry: + %addr_v8i8 = bitcast i8* %addr to <8 x i8>* + %vec = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) + %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0 + %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1 + tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + + tail call void asm sideeffect "", "~{v0},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + ret void +} + +define void @test_D0D1_from_D1D2(i8* %addr) #0 { +; CHECK-LABEL: test_D0D1_from_D1D2: +; CHECK: orr.8b v0, v1 +; CHECK: orr.8b v1, v2 +entry: + %addr_v8i8 = bitcast i8* %addr to <8 x i8>* + %vec = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) + %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0 + %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1 + tail call void asm sideeffect "", "~{v0},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + + tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + ret void +} + +define void @test_D0D1_from_D31D0(i8* %addr) #0 { +; CHECK-LABEL: test_D0D1_from_D31D0: +; CHECK: orr.8b v1, v0 +; CHECK: orr.8b v0, v31 +entry: + %addr_v8i8 = bitcast i8* %addr to <8 x i8>* + %vec = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) + %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0 + %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1 + tail call void asm sideeffect "", "~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30}"() + tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + + tail call void asm sideeffect "", 
"~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + ret void +} + +define void @test_D31D0_from_D0D1(i8* %addr) #0 { +; CHECK-LABEL: test_D31D0_from_D0D1: +; CHECK: orr.8b v31, v0 +; CHECK: orr.8b v0, v1 +entry: + %addr_v8i8 = bitcast i8* %addr to <8 x i8>* + %vec = tail call { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) + %vec0 = extractvalue { <8 x i8>, <8 x i8> } %vec, 0 + %vec1 = extractvalue { <8 x i8>, <8 x i8> } %vec, 1 + tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + + tail call void asm sideeffect "", "~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30}"() + tail call void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, i8* %addr) + ret void +} + +define void @test_D2D3D4_from_D0D1D2(i8* %addr) #0 { +; CHECK-LABEL: test_D2D3D4_from_D0D1D2: +; CHECK: orr.8b v4, v2 +; CHECK: orr.8b v3, v1 +; CHECK: orr.8b v2, v0 +entry: + %addr_v8i8 = bitcast i8* %addr to <8 x i8>* + %vec = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3.v8i8.p0v8i8(<8 x i8>* %addr_v8i8) + %vec0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 0 + %vec1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 1 + %vec2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vec, 2 + + tail call void asm sideeffect "", "~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, i8* %addr) + + tail call void asm sideeffect "", "~{v0},~{v1},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8> %vec0, <8 x i8> %vec1, <8 x i8> %vec2, i8* %addr) + ret void +} + +define void @test_Q0Q1Q2_from_Q1Q2Q3(i8* %addr) #0 { +; CHECK-LABEL: test_Q0Q1Q2_from_Q1Q2Q3: +; CHECK: orr.16b v0, v1 +; CHECK: orr.16b v1, v2 +; CHECK: orr.16b v2, v3 +entry: + %addr_v16i8 = bitcast i8* %addr to <16 x i8>* + %vec = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3.v16i8.p0v16i8(<16 x i8>* %addr_v16i8) + %vec0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 0 + %vec1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 1 + %vec2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vec, 2 + tail call void asm sideeffect "", "~{v0},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, i8* %addr) + + tail call void asm sideeffect "", 
"~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, i8* %addr) + ret void +} + +define void @test_Q1Q2Q3Q4_from_Q30Q31Q0Q1(i8* %addr) #0 { +; CHECK-LABEL: test_Q1Q2Q3Q4_from_Q30Q31Q0Q1: +; CHECK: orr.16b v4, v1 +; CHECK: orr.16b v3, v0 +; CHECK: orr.16b v2, v31 +; CHECK: orr.16b v1, v30 + %addr_v16i8 = bitcast i8* %addr to <16 x i8>* + %vec = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0v16i8(<16 x i8>* %addr_v16i8) + %vec0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 0 + %vec1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 1 + %vec2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 2 + %vec3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vec, 3 + + tail call void asm sideeffect "", "~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}"() + tail call void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, <16 x i8> %vec3, i8* %addr) + + tail call void asm sideeffect "", "~{v0},~{v5},~{v6},~{v7},~{v8},~{v9},~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19},~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29},~{v30},~{v31}"() + tail call void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8> %vec0, <16 x i8> %vec1, <16 x i8> %vec2, <16 x i8> %vec3, i8* %addr) + ret void +} + +declare { <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld2.v8i8.p0v8i8(<8 x i8>*) +declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm64.neon.ld3.v8i8.p0v8i8(<8 x i8>*) +declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld3.v16i8.p0v16i8(<16 x i8>*) +declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm64.neon.ld4.v16i8.p0v16i8(<16 x i8>*) + +declare void @llvm.arm64.neon.st2.v8i8.p0i8(<8 x i8>, <8 x i8>, i8*) +declare void @llvm.arm64.neon.st3.v8i8.p0i8(<8 x i8>, <8 x i8>, <8 x i8>, i8*) +declare void @llvm.arm64.neon.st3.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, i8*) +declare void @llvm.arm64.neon.st4.v16i8.p0i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i8*) diff --git a/test/CodeGen/ARM64/crc32.ll b/test/CodeGen/ARM64/crc32.ll new file mode 100644 index 0000000000..609eb44122 --- /dev/null +++ b/test/CodeGen/ARM64/crc32.ll @@ -0,0 +1,71 @@ +; RUN: llc -march=arm64 -o - %s | FileCheck %s + +define i32 @test_crc32b(i32 %cur, i8 %next) { +; CHECK-LABEL: test_crc32b: +; CHECK: crc32b w0, w0, w1 + %bits = zext i8 %next to i32 + %val = call i32 @llvm.arm64.crc32b(i32 %cur, i32 %bits) + ret i32 %val +} + +define i32 @test_crc32h(i32 %cur, i16 %next) { +; CHECK-LABEL: test_crc32h: +; CHECK: crc32h w0, w0, w1 + %bits = zext i16 %next to i32 + %val = call i32 @llvm.arm64.crc32h(i32 %cur, i32 %bits) + ret i32 %val +} + +define i32 @test_crc32w(i32 %cur, i32 %next) { +; CHECK-LABEL: test_crc32w: +; CHECK: crc32w w0, w0, w1 + %val = call i32 @llvm.arm64.crc32w(i32 %cur, i32 %next) + ret i32 %val +} + +define i32 @test_crc32x(i32 %cur, i64 %next) { +; CHECK-LABEL: test_crc32x: +; CHECK: crc32x w0, w0, x1 + %val = call i32 @llvm.arm64.crc32x(i32 %cur, i64 %next) + ret i32 %val +} + +define i32 @test_crc32cb(i32 %cur, i8 %next) { +; CHECK-LABEL: test_crc32cb: +; 
CHECK: crc32cb w0, w0, w1 + %bits = zext i8 %next to i32 + %val = call i32 @llvm.arm64.crc32cb(i32 %cur, i32 %bits) + ret i32 %val +} + +define i32 @test_crc32ch(i32 %cur, i16 %next) { +; CHECK-LABEL: test_crc32ch: +; CHECK: crc32ch w0, w0, w1 + %bits = zext i16 %next to i32 + %val = call i32 @llvm.arm64.crc32ch(i32 %cur, i32 %bits) + ret i32 %val +} + +define i32 @test_crc32cw(i32 %cur, i32 %next) { +; CHECK-LABEL: test_crc32cw: +; CHECK: crc32cw w0, w0, w1 + %val = call i32 @llvm.arm64.crc32cw(i32 %cur, i32 %next) + ret i32 %val +} + +define i32 @test_crc32cx(i32 %cur, i64 %next) { +; CHECK-LABEL: test_crc32cx: +; CHECK: crc32cx w0, w0, x1 + %val = call i32 @llvm.arm64.crc32cx(i32 %cur, i64 %next) + ret i32 %val +} + +declare i32 @llvm.arm64.crc32b(i32, i32) +declare i32 @llvm.arm64.crc32h(i32, i32) +declare i32 @llvm.arm64.crc32w(i32, i32) +declare i32 @llvm.arm64.crc32x(i32, i64) + +declare i32 @llvm.arm64.crc32cb(i32, i32) +declare i32 @llvm.arm64.crc32ch(i32, i32) +declare i32 @llvm.arm64.crc32cw(i32, i32) +declare i32 @llvm.arm64.crc32cx(i32, i64) diff --git a/test/CodeGen/ARM64/crypto.ll b/test/CodeGen/ARM64/crypto.ll new file mode 100644 index 0000000000..3804310287 --- /dev/null +++ b/test/CodeGen/ARM64/crypto.ll @@ -0,0 +1,135 @@ +; RUN: llc -march=arm64 -arm64-neon-syntax=apple -o - %s | FileCheck %s + +declare <16 x i8> @llvm.arm64.crypto.aese(<16 x i8> %data, <16 x i8> %key) +declare <16 x i8> @llvm.arm64.crypto.aesd(<16 x i8> %data, <16 x i8> %key) +declare <16 x i8> @llvm.arm64.crypto.aesmc(<16 x i8> %data) +declare <16 x i8> @llvm.arm64.crypto.aesimc(<16 x i8> %data) + +define <16 x i8> @test_aese(<16 x i8> %data, <16 x i8> %key) { +; CHECK-LABEL: test_aese: +; CHECK: aese.16b v0, v1 + %res = call <16 x i8> @llvm.arm64.crypto.aese(<16 x i8> %data, <16 x i8> %key) + ret <16 x i8> %res +} + +define <16 x i8> @test_aesd(<16 x i8> %data, <16 x i8> %key) { +; CHECK-LABEL: test_aesd: +; CHECK: aesd.16b v0, v1 + %res = call <16 x i8> @llvm.arm64.crypto.aesd(<16 x i8> %data, <16 x i8> %key) + ret <16 x i8> %res +} + +define <16 x i8> @test_aesmc(<16 x i8> %data) { +; CHECK-LABEL: test_aesmc: +; CHECK: aesmc.16b v0, v0 + %res = call <16 x i8> @llvm.arm64.crypto.aesmc(<16 x i8> %data) + ret <16 x i8> %res +} + +define <16 x i8> @test_aesimc(<16 x i8> %data) { +; CHECK-LABEL: test_aesimc: +; CHECK: aesimc.16b v0, v0 + %res = call <16 x i8> @llvm.arm64.crypto.aesimc(<16 x i8> %data) + ret <16 x i8> %res +} + +declare <4 x i32> @llvm.arm64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) +declare <4 x i32> @llvm.arm64.crypto.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) +declare <4 x i32> @llvm.arm64.crypto.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) +declare i32 @llvm.arm64.crypto.sha1h(i32 %hash_e) +declare <4 x i32> @llvm.arm64.crypto.sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11) +declare <4 x i32> @llvm.arm64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15) + +define <4 x i32> @test_sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) { +; CHECK-LABEL: test_sha1c: +; CHECK: fmov [[HASH_E:s[0-9]+]], w0 +; CHECK: sha1c.4s q0, [[HASH_E]], v1 + %res = call <4 x i32> @llvm.arm64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) + ret <4 x i32> %res +} + +; Incomplete removal of unnecessary FMOV instructions in intrinsic SHA1 +define <4 x i32> @test_sha1c_in_a_row(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) { +; CHECK-LABEL: test_sha1c_in_a_row: +; CHECK: fmov [[HASH_E:s[0-9]+]], w0 +; CHECK: sha1c.4s 
q[[SHA1RES:[0-9]+]], [[HASH_E]], v1 +; CHECK-NOT: fmov +; CHECK: sha1c.4s q0, s[[SHA1RES]], v1 + %res = call <4 x i32> @llvm.arm64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) + %extract = extractelement <4 x i32> %res, i32 0 + %res2 = call <4 x i32> @llvm.arm64.crypto.sha1c(<4 x i32> %hash_abcd, i32 %extract, <4 x i32> %wk) + ret <4 x i32> %res2 +} + +define <4 x i32> @test_sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) { +; CHECK-LABEL: test_sha1p: +; CHECK: fmov [[HASH_E:s[0-9]+]], w0 +; CHECK: sha1p.4s q0, [[HASH_E]], v1 + %res = call <4 x i32> @llvm.arm64.crypto.sha1p(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) + ret <4 x i32> %res +} + +define <4 x i32> @test_sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) { +; CHECK-LABEL: test_sha1m: +; CHECK: fmov [[HASH_E:s[0-9]+]], w0 +; CHECK: sha1m.4s q0, [[HASH_E]], v1 + %res = call <4 x i32> @llvm.arm64.crypto.sha1m(<4 x i32> %hash_abcd, i32 %hash_e, <4 x i32> %wk) + ret <4 x i32> %res +} + +define i32 @test_sha1h(i32 %hash_e) { +; CHECK-LABEL: test_sha1h: +; CHECK: fmov [[HASH_E:s[0-9]+]], w0 +; CHECK: sha1h [[RES:s[0-9]+]], [[HASH_E]] +; CHECK: fmov w0, [[RES]] + %res = call i32 @llvm.arm64.crypto.sha1h(i32 %hash_e) + ret i32 %res +} + +define <4 x i32> @test_sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11) { +; CHECK-LABEL: test_sha1su0: +; CHECK: sha1su0.4s v0, v1, v2 + %res = call <4 x i32> @llvm.arm64.crypto.sha1su0(<4 x i32> %wk0_3, <4 x i32> %wk4_7, <4 x i32> %wk8_11) + ret <4 x i32> %res +} + +define <4 x i32> @test_sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15) { +; CHECK-LABEL: test_sha1su1: +; CHECK: sha1su1.4s v0, v1 + %res = call <4 x i32> @llvm.arm64.crypto.sha1su1(<4 x i32> %wk0_3, <4 x i32> %wk12_15) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.arm64.crypto.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) +declare <4 x i32> @llvm.arm64.crypto.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) +declare <4 x i32> @llvm.arm64.crypto.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7) +declare <4 x i32> @llvm.arm64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) + +define <4 x i32> @test_sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) { +; CHECK-LABEL: test_sha256h: +; CHECK: sha256h.4s q0, q1, v2 + %res = call <4 x i32> @llvm.arm64.crypto.sha256h(<4 x i32> %hash_abcd, <4 x i32> %hash_efgh, <4 x i32> %wk) + ret <4 x i32> %res +} + +define <4 x i32> @test_sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) { +; CHECK-LABEL: test_sha256h2: +; CHECK: sha256h2.4s q0, q1, v2 + + %res = call <4 x i32> @llvm.arm64.crypto.sha256h2(<4 x i32> %hash_efgh, <4 x i32> %hash_abcd, <4 x i32> %wk) + ret <4 x i32> %res +} + +define <4 x i32> @test_sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7) { +; CHECK-LABEL: test_sha256su0: +; CHECK: sha256su0.4s v0, v1 + %res = call <4 x i32> @llvm.arm64.crypto.sha256su0(<4 x i32> %w0_3, <4 x i32> %w4_7) + ret <4 x i32> %res +} + +define <4 x i32> @test_sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) { +; CHECK-LABEL: test_sha256su1: +; CHECK: sha256su1.4s v0, v1, v2 + %res = call <4 x i32> @llvm.arm64.crypto.sha256su1(<4 x i32> %w0_3, <4 x i32> %w8_11, <4 x i32> %w12_15) + ret <4 x i32> %res +} diff --git a/test/CodeGen/ARM64/cse.ll b/test/CodeGen/ARM64/cse.ll new file mode 100644 index 0000000000..d98bfd6053 --- /dev/null +++ b/test/CodeGen/ARM64/cse.ll @@ -0,0 +1,59 @@ +; RUN: llc -O3 < %s | FileCheck %s +target triple = "arm64-apple-ios" + +; 
rdar://12462006
+; CSE between "icmp reg reg" and "sub reg reg".
+; Both can be in the same basic block or in different basic blocks.
+define i8* @t1(i8* %base, i32* nocapture %offset, i32 %size) nounwind {
+entry:
+; CHECK-LABEL: t1:
+; CHECK: subs
+; CHECK-NOT: cmp
+; CHECK-NOT: sub
+; CHECK: b.ge
+; CHECK: sub
+; CHECK: sub
+; CHECK-NOT: sub
+; CHECK: ret
+  %0 = load i32* %offset, align 4
+  %cmp = icmp slt i32 %0, %size
+  %s = sub nsw i32 %0, %size
+  br i1 %cmp, label %return, label %if.end
+
+if.end:
+  %sub = sub nsw i32 %0, %size
+  %s2 = sub nsw i32 %s, %size
+  %s3 = sub nsw i32 %sub, %s2
+  store i32 %s3, i32* %offset, align 4
+  %add.ptr = getelementptr inbounds i8* %base, i32 %sub
+  br label %return
+
+return:
+  %retval.0 = phi i8* [ %add.ptr, %if.end ], [ null, %entry ]
+  ret i8* %retval.0
+}
+
+; CSE between "icmp reg imm" and "sub reg imm".
+define i8* @t2(i8* %base, i32* nocapture %offset) nounwind {
+entry:
+; CHECK-LABEL: t2:
+; CHECK: subs
+; CHECK-NOT: cmp
+; CHECK-NOT: sub
+; CHECK: b.lt
+; CHECK-NOT: sub
+; CHECK: ret
+  %0 = load i32* %offset, align 4
+  %cmp = icmp slt i32 %0, 1
+  br i1 %cmp, label %return, label %if.end
+
+if.end:
+  %sub = sub nsw i32 %0, 1
+  store i32 %sub, i32* %offset, align 4
+  %add.ptr = getelementptr inbounds i8* %base, i32 %sub
+  br label %return
+
+return:
+  %retval.0 = phi i8* [ %add.ptr, %if.end ], [ null, %entry ]
+  ret i8* %retval.0
+}
diff --git a/test/CodeGen/ARM64/csel.ll b/test/CodeGen/ARM64/csel.ll
new file mode 100644
index 0000000000..cbf1769897
--- /dev/null
+++ b/test/CodeGen/ARM64/csel.ll
@@ -0,0 +1,222 @@
+; RUN: llc -O3 < %s | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64"
+target triple = "arm64-unknown-unknown"
+
+; CHECK: foo1
+; CHECK: csinc w{{[0-9]+}}, w[[REG:[0-9]+]],
+; CHECK: w[[REG]], eq
+define i32 @foo1(i32 %b, i32 %c) nounwind readnone ssp {
+entry:
+  %not.tobool = icmp ne i32 %c, 0
+  %add = zext i1 %not.tobool to i32
+  %b.add = add i32 %c, %b
+  %add1 = add i32 %b.add, %add
+  ret i32 %add1
+}
+
+; CHECK: foo2
+; CHECK: csneg w{{[0-9]+}}, w[[REG:[0-9]+]],
+; CHECK: w[[REG]], eq
+define i32 @foo2(i32 %b, i32 %c) nounwind readnone ssp {
+entry:
+  %mul = sub i32 0, %b
+  %tobool = icmp eq i32 %c, 0
+  %b.mul = select i1 %tobool, i32 %b, i32 %mul
+  %add = add nsw i32 %b.mul, %c
+  ret i32 %add
+}
+
+; CHECK: foo3
+; CHECK: csinv w{{[0-9]+}}, w[[REG:[0-9]+]],
+; CHECK: w[[REG]], eq
+define i32 @foo3(i32 %b, i32 %c) nounwind readnone ssp {
+entry:
+  %not.tobool = icmp ne i32 %c, 0
+  %xor = sext i1 %not.tobool to i32
+  %b.xor = xor i32 %xor, %b
+  %add = add nsw i32 %b.xor, %c
+  ret i32 %add
+}
+
+; rdar://11632325
+define i32 @foo4(i32 %a) nounwind ssp {
+; CHECK: foo4
+; CHECK: csneg
+; CHECK-NEXT: ret
+  %cmp = icmp sgt i32 %a, -1
+  %neg = sub nsw i32 0, %a
+  %cond = select i1 %cmp, i32 %a, i32 %neg
+  ret i32 %cond
+}
+
+define i32 @foo5(i32 %a, i32 %b) nounwind ssp {
+entry:
+; CHECK: foo5
+; CHECK: subs
+; CHECK-NEXT: csneg
+; CHECK-NEXT: ret
+  %sub = sub nsw i32 %a, %b
+  %cmp = icmp sgt i32 %sub, -1
+  %sub3 = sub nsw i32 0, %sub
+  %cond = select i1 %cmp, i32 %sub, i32 %sub3
+  ret i32 %cond
+}
+
+; Make sure we can handle a branch instruction in optimizeCompare.
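+; That is, the flag-setting form of the subtraction should feed the branch
+; directly, with no separate compare, roughly (register and label names
+; assumed, not checked below):
+;   subs w8, w0, w1
+;   b.le LBB_else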
+define i32@foo6(i32 %a, i32 %b) nounwind ssp { +; CHECK: foo6 +; CHECK: b + %sub = sub nsw i32 %a, %b + %cmp = icmp sgt i32 %sub, 0 + br i1 %cmp, label %l.if, label %l.else + +l.if: + ret i32 1 + +l.else: + ret i32 %sub +} + +; If CPSR is used multiple times and V flag is used, we don't remove cmp. +define i32 @foo7(i32 %a, i32 %b) nounwind { +entry: +; CHECK-LABEL: foo7: +; CHECK: sub +; CHECK-next: adds +; CHECK-next: csneg +; CHECK-next: b + %sub = sub nsw i32 %a, %b + %cmp = icmp sgt i32 %sub, -1 + %sub3 = sub nsw i32 0, %sub + %cond = select i1 %cmp, i32 %sub, i32 %sub3 + br i1 %cmp, label %if.then, label %if.else + +if.then: + %cmp2 = icmp slt i32 %sub, -1 + %sel = select i1 %cmp2, i32 %cond, i32 %a + ret i32 %sel + +if.else: + ret i32 %cond +} + +define i32 @foo8(i32 %v, i32 %a, i32 %b) nounwind readnone ssp { +entry: +; CHECK-LABEL: foo8: +; CHECK: cmp w0, #0 +; CHECK: csinv w0, w1, w2, ne + %tobool = icmp eq i32 %v, 0 + %neg = xor i32 -1, %b + %cond = select i1 %tobool, i32 %neg, i32 %a + ret i32 %cond +} + +define i32 @foo9(i32 %v) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo9: +; CHECK: cmp w0, #0 +; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4 +; CHECK: csinv w0, w[[REG]], w[[REG]], ne + %tobool = icmp ne i32 %v, 0 + %cond = select i1 %tobool, i32 4, i32 -5 + ret i32 %cond +} + +define i64 @foo10(i64 %v) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo10: +; CHECK: cmp x0, #0 +; CHECK: orr x[[REG:[0-9]+]], xzr, #0x4 +; CHECK: csinv x0, x[[REG]], x[[REG]], ne + %tobool = icmp ne i64 %v, 0 + %cond = select i1 %tobool, i64 4, i64 -5 + ret i64 %cond +} + +define i32 @foo11(i32 %v) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo11: +; CHECK: cmp w0, #0 +; CHECK: orr w[[REG:[0-9]+]], wzr, #0x4 +; CHECK: csneg w0, w[[REG]], w[[REG]], ne + %tobool = icmp ne i32 %v, 0 + %cond = select i1 %tobool, i32 4, i32 -4 + ret i32 %cond +} + +define i64 @foo12(i64 %v) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo12: +; CHECK: cmp x0, #0 +; CHECK: orr x[[REG:[0-9]+]], xzr, #0x4 +; CHECK: csneg x0, x[[REG]], x[[REG]], ne + %tobool = icmp ne i64 %v, 0 + %cond = select i1 %tobool, i64 4, i64 -4 + ret i64 %cond +} + +define i32 @foo13(i32 %v, i32 %a, i32 %b) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo13: +; CHECK: cmp w0, #0 +; CHECK: csneg w0, w1, w2, ne + %tobool = icmp eq i32 %v, 0 + %sub = sub i32 0, %b + %cond = select i1 %tobool, i32 %sub, i32 %a + ret i32 %cond +} + +define i64 @foo14(i64 %v, i64 %a, i64 %b) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo14: +; CHECK: cmp x0, #0 +; CHECK: csneg x0, x1, x2, ne + %tobool = icmp eq i64 %v, 0 + %sub = sub i64 0, %b + %cond = select i1 %tobool, i64 %sub, i64 %a + ret i64 %cond +} + +define i32 @foo15(i32 %a, i32 %b) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo15: +; CHECK: cmp w0, w1 +; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1 +; CHECK: csinc w0, w[[REG]], w[[REG]], le + %cmp = icmp sgt i32 %a, %b + %. = select i1 %cmp, i32 2, i32 1 + ret i32 %. +} + +define i32 @foo16(i32 %a, i32 %b) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo16: +; CHECK: cmp w0, w1 +; CHECK: orr w[[REG:[0-9]+]], wzr, #0x1 +; CHECK: csinc w0, w[[REG]], w[[REG]], gt + %cmp = icmp sgt i32 %a, %b + %. = select i1 %cmp, i32 1, i32 2 + ret i32 %. +} + +define i64 @foo17(i64 %a, i64 %b) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo17: +; CHECK: cmp x0, x1 +; CHECK: orr x[[REG:[0-9]+]], xzr, #0x1 +; CHECK: csinc x0, x[[REG]], x[[REG]], le + %cmp = icmp sgt i64 %a, %b + %. 
= select i1 %cmp, i64 2, i64 1 + ret i64 %. +} + +define i64 @foo18(i64 %a, i64 %b) nounwind readnone optsize ssp { +entry: +; CHECK-LABEL: foo18: +; CHECK: cmp x0, x1 +; CHECK: orr x[[REG:[0-9]+]], xzr, #0x1 +; CHECK: csinc x0, x[[REG]], x[[REG]], gt + %cmp = icmp sgt i64 %a, %b + %. = select i1 %cmp, i64 1, i64 2 + ret i64 %. +} diff --git a/test/CodeGen/ARM64/cvt.ll b/test/CodeGen/ARM64/cvt.ll new file mode 100644 index 0000000000..b55a42fdf8 --- /dev/null +++ b/test/CodeGen/ARM64/cvt.ll @@ -0,0 +1,401 @@ +; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s + +; +; Floating-point scalar convert to signed integer (to nearest with ties to away) +; +define i32 @fcvtas_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtas_1w1s: +;CHECK: fcvtas w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.arm64.neon.fcvtas.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtas_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtas_1x1s: +;CHECK: fcvtas x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.arm64.neon.fcvtas.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtas_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtas_1w1d: +;CHECK: fcvtas w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.arm64.neon.fcvtas.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtas_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtas_1x1d: +;CHECK: fcvtas x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.arm64.neon.fcvtas.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.arm64.neon.fcvtas.i32.f32(float) nounwind readnone +declare i64 @llvm.arm64.neon.fcvtas.i64.f32(float) nounwind readnone +declare i32 @llvm.arm64.neon.fcvtas.i32.f64(double) nounwind readnone +declare i64 @llvm.arm64.neon.fcvtas.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to unsigned integer +; +define i32 @fcvtau_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtau_1w1s: +;CHECK: fcvtau w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.arm64.neon.fcvtau.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtau_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtau_1x1s: +;CHECK: fcvtau x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.arm64.neon.fcvtau.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtau_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtau_1w1d: +;CHECK: fcvtau w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.arm64.neon.fcvtau.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtau_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtau_1x1d: +;CHECK: fcvtau x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.arm64.neon.fcvtau.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.arm64.neon.fcvtau.i32.f32(float) nounwind readnone +declare i64 @llvm.arm64.neon.fcvtau.i64.f32(float) nounwind readnone +declare i32 @llvm.arm64.neon.fcvtau.i32.f64(double) nounwind readnone +declare i64 @llvm.arm64.neon.fcvtau.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to signed integer (toward -Inf) +; +define i32 @fcvtms_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtms_1w1s: +;CHECK: fcvtms w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.arm64.neon.fcvtms.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtms_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtms_1x1s: +;CHECK: fcvtms x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.arm64.neon.fcvtms.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtms_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtms_1w1d: +;CHECK: fcvtms w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.arm64.neon.fcvtms.i32.f64(double %A) + ret i32 %tmp3 +} + 
+define i64 @fcvtms_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtms_1x1d: +;CHECK: fcvtms x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.arm64.neon.fcvtms.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.arm64.neon.fcvtms.i32.f32(float) nounwind readnone +declare i64 @llvm.arm64.neon.fcvtms.i64.f32(float) nounwind readnone +declare i32 @llvm.arm64.neon.fcvtms.i32.f64(double) nounwind readnone +declare i64 @llvm.arm64.neon.fcvtms.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to unsigned integer (toward -Inf) +; +define i32 @fcvtmu_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtmu_1w1s: +;CHECK: fcvtmu w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.arm64.neon.fcvtmu.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtmu_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtmu_1x1s: +;CHECK: fcvtmu x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.arm64.neon.fcvtmu.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtmu_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtmu_1w1d: +;CHECK: fcvtmu w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.arm64.neon.fcvtmu.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtmu_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtmu_1x1d: +;CHECK: fcvtmu x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.arm64.neon.fcvtmu.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.arm64.neon.fcvtmu.i32.f32(float) nounwind readnone +declare i64 @llvm.arm64.neon.fcvtmu.i64.f32(float) nounwind readnone +declare i32 @llvm.arm64.neon.fcvtmu.i32.f64(double) nounwind readnone +declare i64 @llvm.arm64.neon.fcvtmu.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to signed integer (to nearest with ties to even) +; +define i32 @fcvtns_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtns_1w1s: +;CHECK: fcvtns w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.arm64.neon.fcvtns.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtns_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtns_1x1s: +;CHECK: fcvtns x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.arm64.neon.fcvtns.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtns_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtns_1w1d: +;CHECK: fcvtns w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.arm64.neon.fcvtns.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtns_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtns_1x1d: +;CHECK: fcvtns x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.arm64.neon.fcvtns.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.arm64.neon.fcvtns.i32.f32(float) nounwind readnone +declare i64 @llvm.arm64.neon.fcvtns.i64.f32(float) nounwind readnone +declare i32 @llvm.arm64.neon.fcvtns.i32.f64(double) nounwind readnone +declare i64 @llvm.arm64.neon.fcvtns.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to unsigned integer (to nearest with ties to even) +; +define i32 @fcvtnu_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtnu_1w1s: +;CHECK: fcvtnu w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.arm64.neon.fcvtnu.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtnu_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtnu_1x1s: +;CHECK: fcvtnu x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.arm64.neon.fcvtnu.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtnu_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtnu_1w1d: +;CHECK: fcvtnu w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.arm64.neon.fcvtnu.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtnu_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtnu_1x1d: 
+;CHECK: fcvtnu x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.arm64.neon.fcvtnu.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.arm64.neon.fcvtnu.i32.f32(float) nounwind readnone +declare i64 @llvm.arm64.neon.fcvtnu.i64.f32(float) nounwind readnone +declare i32 @llvm.arm64.neon.fcvtnu.i32.f64(double) nounwind readnone +declare i64 @llvm.arm64.neon.fcvtnu.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to signed integer (toward +Inf) +; +define i32 @fcvtps_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtps_1w1s: +;CHECK: fcvtps w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.arm64.neon.fcvtps.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtps_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtps_1x1s: +;CHECK: fcvtps x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.arm64.neon.fcvtps.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtps_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtps_1w1d: +;CHECK: fcvtps w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.arm64.neon.fcvtps.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtps_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtps_1x1d: +;CHECK: fcvtps x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.arm64.neon.fcvtps.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.arm64.neon.fcvtps.i32.f32(float) nounwind readnone +declare i64 @llvm.arm64.neon.fcvtps.i64.f32(float) nounwind readnone +declare i32 @llvm.arm64.neon.fcvtps.i32.f64(double) nounwind readnone +declare i64 @llvm.arm64.neon.fcvtps.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to unsigned integer (toward +Inf) +; +define i32 @fcvtpu_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtpu_1w1s: +;CHECK: fcvtpu w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.arm64.neon.fcvtpu.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtpu_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtpu_1x1s: +;CHECK: fcvtpu x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.arm64.neon.fcvtpu.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtpu_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtpu_1w1d: +;CHECK: fcvtpu w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.arm64.neon.fcvtpu.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtpu_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtpu_1x1d: +;CHECK: fcvtpu x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.arm64.neon.fcvtpu.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.arm64.neon.fcvtpu.i32.f32(float) nounwind readnone +declare i64 @llvm.arm64.neon.fcvtpu.i64.f32(float) nounwind readnone +declare i32 @llvm.arm64.neon.fcvtpu.i32.f64(double) nounwind readnone +declare i64 @llvm.arm64.neon.fcvtpu.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to signed integer (toward zero) +; +define i32 @fcvtzs_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtzs_1w1s: +;CHECK: fcvtzs w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.arm64.neon.fcvtzs.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtzs_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtzs_1x1s: +;CHECK: fcvtzs x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.arm64.neon.fcvtzs.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtzs_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtzs_1w1d: +;CHECK: fcvtzs w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.arm64.neon.fcvtzs.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtzs_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtzs_1x1d: +;CHECK: fcvtzs x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.arm64.neon.fcvtzs.i64.f64(double %A) + ret 
i64 %tmp3 +} + +declare i32 @llvm.arm64.neon.fcvtzs.i32.f32(float) nounwind readnone +declare i64 @llvm.arm64.neon.fcvtzs.i64.f32(float) nounwind readnone +declare i32 @llvm.arm64.neon.fcvtzs.i32.f64(double) nounwind readnone +declare i64 @llvm.arm64.neon.fcvtzs.i64.f64(double) nounwind readnone + +; +; Floating-point scalar convert to unsigned integer (toward zero) +; +define i32 @fcvtzu_1w1s(float %A) nounwind { +;CHECK-LABEL: fcvtzu_1w1s: +;CHECK: fcvtzu w0, s0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.arm64.neon.fcvtzu.i32.f32(float %A) + ret i32 %tmp3 +} + +define i64 @fcvtzu_1x1s(float %A) nounwind { +;CHECK-LABEL: fcvtzu_1x1s: +;CHECK: fcvtzu x0, s0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.arm64.neon.fcvtzu.i64.f32(float %A) + ret i64 %tmp3 +} + +define i32 @fcvtzu_1w1d(double %A) nounwind { +;CHECK-LABEL: fcvtzu_1w1d: +;CHECK: fcvtzu w0, d0 +;CHECK-NEXT: ret + %tmp3 = call i32 @llvm.arm64.neon.fcvtzu.i32.f64(double %A) + ret i32 %tmp3 +} + +define i64 @fcvtzu_1x1d(double %A) nounwind { +;CHECK-LABEL: fcvtzu_1x1d: +;CHECK: fcvtzu x0, d0 +;CHECK-NEXT: ret + %tmp3 = call i64 @llvm.arm64.neon.fcvtzu.i64.f64(double %A) + ret i64 %tmp3 +} + +declare i32 @llvm.arm64.neon.fcvtzu.i32.f32(float) nounwind readnone +declare i64 @llvm.arm64.neon.fcvtzu.i64.f32(float) nounwind readnone +declare i32 @llvm.arm64.neon.fcvtzu.i32.f64(double) nounwind readnone +declare i64 @llvm.arm64.neon.fcvtzu.i64.f64(double) nounwind readnone diff --git a/test/CodeGen/ARM64/dagcombiner-convergence.ll b/test/CodeGen/ARM64/dagcombiner-convergence.ll new file mode 100644 index 0000000000..a45e31320d --- /dev/null +++ b/test/CodeGen/ARM64/dagcombiner-convergence.ll @@ -0,0 +1,19 @@ +; RUN: llc < %s -o /dev/null +; rdar://10795250 +; DAGCombiner should converge. + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n32:64" +target triple = "arm64-apple-macosx10.8.0" + +define i64 @foo(i128 %Params.coerce, i128 %SelLocs.coerce) { +entry: + %tmp = lshr i128 %Params.coerce, 61 + %.tr38.i = trunc i128 %tmp to i64 + %mul.i = and i64 %.tr38.i, 4294967288 + %tmp1 = lshr i128 %SelLocs.coerce, 62 + %.tr.i = trunc i128 %tmp1 to i64 + %mul7.i = and i64 %.tr.i, 4294967292 + %add.i = add i64 %mul7.i, %mul.i + %conv.i.i = and i64 %add.i, 4294967292 + ret i64 %conv.i.i +} diff --git a/test/CodeGen/ARM64/dagcombiner-load-slicing.ll b/test/CodeGen/ARM64/dagcombiner-load-slicing.ll new file mode 100644 index 0000000000..0679014e59 --- /dev/null +++ b/test/CodeGen/ARM64/dagcombiner-load-slicing.ll @@ -0,0 +1,102 @@ +; RUN: llc -mtriple arm64-apple-ios -O3 -o - < %s | FileCheck %s +; + +%class.Complex = type { float, float } +%class.Complex_int = type { i32, i32 } +%class.Complex_long = type { i64, i64 } + +; CHECK-LABEL: @test +; CHECK: add [[BASE:x[0-9]+]], x0, x1, lsl #3 +; CHECK: ldp [[CPLX1_I:s[0-9]+]], [[CPLX1_R:s[0-9]+]], {{\[}}[[BASE]]] +; CHECK: ldp [[CPLX2_I:s[0-9]+]], [[CPLX2_R:s[0-9]+]], {{\[}}[[BASE]], #64] +; CHECK: fadd {{s[0-9]+}}, [[CPLX2_I]], [[CPLX1_I]] +; CHECK: fadd {{s[0-9]+}}, [[CPLX2_R]], [[CPLX1_R]] +; CHECK: ret +define void @test(%class.Complex* nocapture %out, i64 %out_start) { +entry: + %arrayidx = getelementptr inbounds %class.Complex* %out, i64 %out_start + %0 = bitcast %class.Complex* %arrayidx to i64* + %1 = load i64* %0, align 4 + %t0.sroa.0.0.extract.trunc = trunc i64 %1 to i32 + %2 = bitcast i32 %t0.sroa.0.0.extract.trunc to float + %t0.sroa.2.0.extract.shift = lshr i64 %1, 32 + %t0.sroa.2.0.extract.trunc = trunc i64 
%t0.sroa.2.0.extract.shift to i32 + %3 = bitcast i32 %t0.sroa.2.0.extract.trunc to float + %add = add i64 %out_start, 8 + %arrayidx2 = getelementptr inbounds %class.Complex* %out, i64 %add + %i.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 0 + %4 = load float* %i.i, align 4 + %add.i = fadd float %4, %2 + %retval.sroa.0.0.vec.insert.i = insertelement <2 x float> undef, float %add.i, i32 0 + %r.i = getelementptr inbounds %class.Complex* %arrayidx2, i64 0, i32 1 + %5 = load float* %r.i, align 4 + %add5.i = fadd float %5, %3 + %retval.sroa.0.4.vec.insert.i = insertelement <2 x float> %retval.sroa.0.0.vec.insert.i, float %add5.i, i32 1 + %ref.tmp.sroa.0.0.cast = bitcast %class.Complex* %arrayidx to <2 x float>* + store <2 x float> %retval.sroa.0.4.vec.insert.i, <2 x float>* %ref.tmp.sroa.0.0.cast, align 4 + ret void +} + +; CHECK-LABEL: @test_int +; CHECK: add [[BASE:x[0-9]+]], x0, x1, lsl #3 +; CHECK: ldp [[CPLX1_I:w[0-9]+]], [[CPLX1_R:w[0-9]+]], {{\[}}[[BASE]]] +; CHECK: ldp [[CPLX2_I:w[0-9]+]], [[CPLX2_R:w[0-9]+]], {{\[}}[[BASE]], #64] +; CHECK: add {{w[0-9]+}}, [[CPLX2_I]], [[CPLX1_I]] +; CHECK: add {{w[0-9]+}}, [[CPLX2_R]], [[CPLX1_R]] +; CHECK: ret +define void @test_int(%class.Complex_int* nocapture %out, i64 %out_start) { +entry: + %arrayidx = getelementptr inbounds %class.Complex_int* %out, i64 %out_start + %0 = bitcast %class.Complex_int* %arrayidx to i64* + %1 = load i64* %0, align 4 + %t0.sroa.0.0.extract.trunc = trunc i64 %1 to i32 + %2 = bitcast i32 %t0.sroa.0.0.extract.trunc to i32 + %t0.sroa.2.0.extract.shift = lshr i64 %1, 32 + %t0.sroa.2.0.extract.trunc = trunc i64 %t0.sroa.2.0.extract.shift to i32 + %3 = bitcast i32 %t0.sroa.2.0.extract.trunc to i32 + %add = add i64 %out_start, 8 + %arrayidx2 = getelementptr inbounds %class.Complex_int* %out, i64 %add + %i.i = getelementptr inbounds %class.Complex_int* %arrayidx2, i64 0, i32 0 + %4 = load i32* %i.i, align 4 + %add.i = add i32 %4, %2 + %retval.sroa.0.0.vec.insert.i = insertelement <2 x i32> undef, i32 %add.i, i32 0 + %r.i = getelementptr inbounds %class.Complex_int* %arrayidx2, i64 0, i32 1 + %5 = load i32* %r.i, align 4 + %add5.i = add i32 %5, %3 + %retval.sroa.0.4.vec.insert.i = insertelement <2 x i32> %retval.sroa.0.0.vec.insert.i, i32 %add5.i, i32 1 + %ref.tmp.sroa.0.0.cast = bitcast %class.Complex_int* %arrayidx to <2 x i32>* + store <2 x i32> %retval.sroa.0.4.vec.insert.i, <2 x i32>* %ref.tmp.sroa.0.0.cast, align 4 + ret void +} + +; CHECK-LABEL: @test_long +; CHECK: add [[BASE:x[0-9]+]], x0, x1, lsl #4 +; CHECK: ldp [[CPLX1_I:x[0-9]+]], [[CPLX1_R:x[0-9]+]], {{\[}}[[BASE]]] +; CHECK: ldp [[CPLX2_I:x[0-9]+]], [[CPLX2_R:x[0-9]+]], {{\[}}[[BASE]], #128] +; CHECK: add {{x[0-9]+}}, [[CPLX2_I]], [[CPLX1_I]] +; CHECK: add {{x[0-9]+}}, [[CPLX2_R]], [[CPLX1_R]] +; CHECK: ret +define void @test_long(%class.Complex_long* nocapture %out, i64 %out_start) { +entry: + %arrayidx = getelementptr inbounds %class.Complex_long* %out, i64 %out_start + %0 = bitcast %class.Complex_long* %arrayidx to i128* + %1 = load i128* %0, align 4 + %t0.sroa.0.0.extract.trunc = trunc i128 %1 to i64 + %2 = bitcast i64 %t0.sroa.0.0.extract.trunc to i64 + %t0.sroa.2.0.extract.shift = lshr i128 %1, 64 + %t0.sroa.2.0.extract.trunc = trunc i128 %t0.sroa.2.0.extract.shift to i64 + %3 = bitcast i64 %t0.sroa.2.0.extract.trunc to i64 + %add = add i64 %out_start, 8 + %arrayidx2 = getelementptr inbounds %class.Complex_long* %out, i64 %add + %i.i = getelementptr inbounds %class.Complex_long* %arrayidx2, i32 0, i32 0 + %4 = load i64* %i.i, align 4 + 
%add.i = add i64 %4, %2 + %retval.sroa.0.0.vec.insert.i = insertelement <2 x i64> undef, i64 %add.i, i32 0 + %r.i = getelementptr inbounds %class.Complex_long* %arrayidx2, i32 0, i32 1 + %5 = load i64* %r.i, align 4 + %add5.i = add i64 %5, %3 + %retval.sroa.0.4.vec.insert.i = insertelement <2 x i64> %retval.sroa.0.0.vec.insert.i, i64 %add5.i, i32 1 + %ref.tmp.sroa.0.0.cast = bitcast %class.Complex_long* %arrayidx to <2 x i64>* + store <2 x i64> %retval.sroa.0.4.vec.insert.i, <2 x i64>* %ref.tmp.sroa.0.0.cast, align 4 + ret void +} diff --git a/test/CodeGen/ARM64/dup.ll b/test/CodeGen/ARM64/dup.ll new file mode 100644 index 0000000000..e65957522b --- /dev/null +++ b/test/CodeGen/ARM64/dup.ll @@ -0,0 +1,322 @@ +; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple -asm-verbose=false | FileCheck %s + +define <8 x i8> @v_dup8(i8 %A) nounwind { +;CHECK-LABEL: v_dup8: +;CHECK: dup.8b + %tmp1 = insertelement <8 x i8> zeroinitializer, i8 %A, i32 0 + %tmp2 = insertelement <8 x i8> %tmp1, i8 %A, i32 1 + %tmp3 = insertelement <8 x i8> %tmp2, i8 %A, i32 2 + %tmp4 = insertelement <8 x i8> %tmp3, i8 %A, i32 3 + %tmp5 = insertelement <8 x i8> %tmp4, i8 %A, i32 4 + %tmp6 = insertelement <8 x i8> %tmp5, i8 %A, i32 5 + %tmp7 = insertelement <8 x i8> %tmp6, i8 %A, i32 6 + %tmp8 = insertelement <8 x i8> %tmp7, i8 %A, i32 7 + ret <8 x i8> %tmp8 +} + +define <4 x i16> @v_dup16(i16 %A) nounwind { +;CHECK-LABEL: v_dup16: +;CHECK: dup.4h + %tmp1 = insertelement <4 x i16> zeroinitializer, i16 %A, i32 0 + %tmp2 = insertelement <4 x i16> %tmp1, i16 %A, i32 1 + %tmp3 = insertelement <4 x i16> %tmp2, i16 %A, i32 2 + %tmp4 = insertelement <4 x i16> %tmp3, i16 %A, i32 3 + ret <4 x i16> %tmp4 +} + +define <2 x i32> @v_dup32(i32 %A) nounwind { +;CHECK-LABEL: v_dup32: +;CHECK: dup.2s + %tmp1 = insertelement <2 x i32> zeroinitializer, i32 %A, i32 0 + %tmp2 = insertelement <2 x i32> %tmp1, i32 %A, i32 1 + ret <2 x i32> %tmp2 +} + +define <2 x float> @v_dupfloat(float %A) nounwind { +;CHECK-LABEL: v_dupfloat: +;CHECK: dup.2s + %tmp1 = insertelement <2 x float> zeroinitializer, float %A, i32 0 + %tmp2 = insertelement <2 x float> %tmp1, float %A, i32 1 + ret <2 x float> %tmp2 +} + +define <16 x i8> @v_dupQ8(i8 %A) nounwind { +;CHECK-LABEL: v_dupQ8: +;CHECK: dup.16b + %tmp1 = insertelement <16 x i8> zeroinitializer, i8 %A, i32 0 + %tmp2 = insertelement <16 x i8> %tmp1, i8 %A, i32 1 + %tmp3 = insertelement <16 x i8> %tmp2, i8 %A, i32 2 + %tmp4 = insertelement <16 x i8> %tmp3, i8 %A, i32 3 + %tmp5 = insertelement <16 x i8> %tmp4, i8 %A, i32 4 + %tmp6 = insertelement <16 x i8> %tmp5, i8 %A, i32 5 + %tmp7 = insertelement <16 x i8> %tmp6, i8 %A, i32 6 + %tmp8 = insertelement <16 x i8> %tmp7, i8 %A, i32 7 + %tmp9 = insertelement <16 x i8> %tmp8, i8 %A, i32 8 + %tmp10 = insertelement <16 x i8> %tmp9, i8 %A, i32 9 + %tmp11 = insertelement <16 x i8> %tmp10, i8 %A, i32 10 + %tmp12 = insertelement <16 x i8> %tmp11, i8 %A, i32 11 + %tmp13 = insertelement <16 x i8> %tmp12, i8 %A, i32 12 + %tmp14 = insertelement <16 x i8> %tmp13, i8 %A, i32 13 + %tmp15 = insertelement <16 x i8> %tmp14, i8 %A, i32 14 + %tmp16 = insertelement <16 x i8> %tmp15, i8 %A, i32 15 + ret <16 x i8> %tmp16 +} + +define <8 x i16> @v_dupQ16(i16 %A) nounwind { +;CHECK-LABEL: v_dupQ16: +;CHECK: dup.8h + %tmp1 = insertelement <8 x i16> zeroinitializer, i16 %A, i32 0 + %tmp2 = insertelement <8 x i16> %tmp1, i16 %A, i32 1 + %tmp3 = insertelement <8 x i16> %tmp2, i16 %A, i32 2 + %tmp4 = insertelement <8 x i16> %tmp3, i16 %A, i32 3 + %tmp5 = insertelement <8 x i16> %tmp4, i16 %A, 
i32 4 + %tmp6 = insertelement <8 x i16> %tmp5, i16 %A, i32 5 + %tmp7 = insertelement <8 x i16> %tmp6, i16 %A, i32 6 + %tmp8 = insertelement <8 x i16> %tmp7, i16 %A, i32 7 + ret <8 x i16> %tmp8 +} + +define <4 x i32> @v_dupQ32(i32 %A) nounwind { +;CHECK-LABEL: v_dupQ32: +;CHECK: dup.4s + %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %A, i32 0 + %tmp2 = insertelement <4 x i32> %tmp1, i32 %A, i32 1 + %tmp3 = insertelement <4 x i32> %tmp2, i32 %A, i32 2 + %tmp4 = insertelement <4 x i32> %tmp3, i32 %A, i32 3 + ret <4 x i32> %tmp4 +} + +define <4 x float> @v_dupQfloat(float %A) nounwind { +;CHECK-LABEL: v_dupQfloat: +;CHECK: dup.4s + %tmp1 = insertelement <4 x float> zeroinitializer, float %A, i32 0 + %tmp2 = insertelement <4 x float> %tmp1, float %A, i32 1 + %tmp3 = insertelement <4 x float> %tmp2, float %A, i32 2 + %tmp4 = insertelement <4 x float> %tmp3, float %A, i32 3 + ret <4 x float> %tmp4 +} + +; Check to make sure it works with shuffles, too. + +define <8 x i8> @v_shuffledup8(i8 %A) nounwind { +;CHECK-LABEL: v_shuffledup8: +;CHECK: dup.8b + %tmp1 = insertelement <8 x i8> undef, i8 %A, i32 0 + %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> zeroinitializer + ret <8 x i8> %tmp2 +} + +define <4 x i16> @v_shuffledup16(i16 %A) nounwind { +;CHECK-LABEL: v_shuffledup16: +;CHECK: dup.4h + %tmp1 = insertelement <4 x i16> undef, i16 %A, i32 0 + %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> zeroinitializer + ret <4 x i16> %tmp2 +} + +define <2 x i32> @v_shuffledup32(i32 %A) nounwind { +;CHECK-LABEL: v_shuffledup32: +;CHECK: dup.2s + %tmp1 = insertelement <2 x i32> undef, i32 %A, i32 0 + %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> zeroinitializer + ret <2 x i32> %tmp2 +} + +define <2 x float> @v_shuffledupfloat(float %A) nounwind { +;CHECK-LABEL: v_shuffledupfloat: +;CHECK: dup.2s + %tmp1 = insertelement <2 x float> undef, float %A, i32 0 + %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> zeroinitializer + ret <2 x float> %tmp2 +} + +define <16 x i8> @v_shuffledupQ8(i8 %A) nounwind { +;CHECK-LABEL: v_shuffledupQ8: +;CHECK: dup.16b + %tmp1 = insertelement <16 x i8> undef, i8 %A, i32 0 + %tmp2 = shufflevector <16 x i8> %tmp1, <16 x i8> undef, <16 x i32> zeroinitializer + ret <16 x i8> %tmp2 +} + +define <8 x i16> @v_shuffledupQ16(i16 %A) nounwind { +;CHECK-LABEL: v_shuffledupQ16: +;CHECK: dup.8h + %tmp1 = insertelement <8 x i16> undef, i16 %A, i32 0 + %tmp2 = shufflevector <8 x i16> %tmp1, <8 x i16> undef, <8 x i32> zeroinitializer + ret <8 x i16> %tmp2 +} + +define <4 x i32> @v_shuffledupQ32(i32 %A) nounwind { +;CHECK-LABEL: v_shuffledupQ32: +;CHECK: dup.4s + %tmp1 = insertelement <4 x i32> undef, i32 %A, i32 0 + %tmp2 = shufflevector <4 x i32> %tmp1, <4 x i32> undef, <4 x i32> zeroinitializer + ret <4 x i32> %tmp2 +} + +define <4 x float> @v_shuffledupQfloat(float %A) nounwind { +;CHECK-LABEL: v_shuffledupQfloat: +;CHECK: dup.4s + %tmp1 = insertelement <4 x float> undef, float %A, i32 0 + %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer + ret <4 x float> %tmp2 +} + +define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind { +;CHECK-LABEL: vduplane8: +;CHECK: dup.8b + %tmp1 = load <8 x i8>* %A + %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind { +;CHECK-LABEL: vduplane16: +;CHECK: dup.4h + %tmp1 = load <4 x i16>* %A + %tmp2 = 
shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind { +;CHECK-LABEL: vduplane32: +;CHECK: dup.2s + %tmp1 = load <2 x i32>* %A + %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 > + ret <2 x i32> %tmp2 +} + +define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind { +;CHECK-LABEL: vduplanefloat: +;CHECK: dup.2s + %tmp1 = load <2 x float>* %A + %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 > + ret <2 x float> %tmp2 +} + +define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind { +;CHECK-LABEL: vduplaneQ8: +;CHECK: dup.16b + %tmp1 = load <8 x i8>* %A + %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind { +;CHECK-LABEL: vduplaneQ16: +;CHECK: dup.8h + %tmp1 = load <4 x i16>* %A + %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind { +;CHECK-LABEL: vduplaneQ32: +;CHECK: dup.4s + %tmp1 = load <2 x i32>* %A + %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > + ret <4 x i32> %tmp2 +} + +define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind { +;CHECK-LABEL: vduplaneQfloat: +;CHECK: dup.4s + %tmp1 = load <2 x float>* %A + %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > + ret <4 x float> %tmp2 +} + +define <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone { +;CHECK-LABEL: foo: +;CHECK: dup.2d +entry: + %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> + ret <2 x i64> %0 +} + +define <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone { +;CHECK-LABEL: bar: +;CHECK: dup.2d +entry: + %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> + ret <2 x i64> %0 +} + +define <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone { +;CHECK-LABEL: baz: +;CHECK: dup.2d +entry: + %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> + ret <2 x double> %0 +} + +define <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone { +;CHECK-LABEL: qux: +;CHECK: dup.2d +entry: + %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> + ret <2 x double> %0 +} + +define <2 x i32> @f(i32 %a, i32 %b) nounwind readnone { +; CHECK-LABEL: f: +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ins.s v0[1], w1 +; CHECK-NEXT: ret + %vecinit = insertelement <2 x i32> undef, i32 %a, i32 0 + %vecinit1 = insertelement <2 x i32> %vecinit, i32 %b, i32 1 + ret <2 x i32> %vecinit1 +} + +define <4 x i32> @g(i32 %a, i32 %b) nounwind readnone { +; CHECK-LABEL: g: +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: ins.s v0[1], w1 +; CHECK-NEXT: ins.s v0[2], w1 +; CHECK-NEXT: ins.s v0[3], w0 +; CHECK-NEXT: ret + %vecinit = insertelement <4 x i32> undef, i32 %a, i32 0 + %vecinit1 = insertelement <4 x i32> %vecinit, i32 %b, i32 1 + %vecinit2 = insertelement <4 x i32> %vecinit1, i32 %b, i32 2 + %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %a, i32 3 + ret <4 x i32> %vecinit3 +} + +define <2 x i64> @h(i64 %a, i64 %b) nounwind readnone { +; CHECK-LABEL: h: +; CHECK-NEXT: fmov d0, x0 +; 
CHECK-NEXT: ins.d v0[1], x1 +; CHECK-NEXT: ret + %vecinit = insertelement <2 x i64> undef, i64 %a, i32 0 + %vecinit1 = insertelement <2 x i64> %vecinit, i64 %b, i32 1 + ret <2 x i64> %vecinit1 +} + +; We used to spot this as a BUILD_VECTOR implementable by dup, but assume that +; the single value needed was of the same type as the vector. This is false if +; the scalar corresponding to the vector type is illegal (e.g. a <4 x i16> +; BUILD_VECTOR will have an i32 as its source). In that case, the operation is +; not a simple "dup vD.4h, vN.h[idx]" after all, and we crashed. +define <4 x i16> @test_build_illegal(<4 x i32> %in) { +; CHECK-LABEL: test_build_illegal: +; CHECK: umov.s [[WTMP:w[0-9]+]], v0[3] +; CHECK: dup.4h v0, [[WTMP]] + %val = extractelement <4 x i32> %in, i32 3 + %smallval = trunc i32 %val to i16 + %vec = insertelement <4x i16> undef, i16 %smallval, i32 3 + + ret <4 x i16> %vec +} + +; We used to inherit an already extract_subvectored v4i16 from +; SelectionDAGBuilder here. We then added a DUPLANE on top of that, preventing +; the formation of an indexed-by-7 MLS. +define <4 x i16> @test_high_splat(<4 x i16> %a, <4 x i16> %b, <8 x i16> %v) #0 { +; CHECK-LABEL: test_high_splat: +; CHECK: mls.4h v0, v1, v2[7] +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <4 x i32> + %mul = mul <4 x i16> %shuffle, %b + %sub = sub <4 x i16> %a, %mul + ret <4 x i16> %sub +} diff --git a/test/CodeGen/ARM64/early-ifcvt.ll b/test/CodeGen/ARM64/early-ifcvt.ll new file mode 100644 index 0000000000..a5c1e26c61 --- /dev/null +++ b/test/CodeGen/ARM64/early-ifcvt.ll @@ -0,0 +1,423 @@ +; RUN: llc < %s -stress-early-ifcvt | FileCheck %s +target triple = "arm64-apple-macosx" + +; CHECK: mm2 +define i32 @mm2(i32* nocapture %p, i32 %n) nounwind uwtable readonly ssp { +entry: + br label %do.body + +; CHECK: do.body +; Loop body has no branches before the backedge. 
+; CHECK-NOT: LBB +do.body: + %max.0 = phi i32 [ 0, %entry ], [ %max.1, %do.cond ] + %min.0 = phi i32 [ 0, %entry ], [ %min.1, %do.cond ] + %n.addr.0 = phi i32 [ %n, %entry ], [ %dec, %do.cond ] + %p.addr.0 = phi i32* [ %p, %entry ], [ %incdec.ptr, %do.cond ] + %incdec.ptr = getelementptr inbounds i32* %p.addr.0, i64 1 + %0 = load i32* %p.addr.0, align 4 + %cmp = icmp sgt i32 %0, %max.0 + br i1 %cmp, label %do.cond, label %if.else + +if.else: + %cmp1 = icmp slt i32 %0, %min.0 + %.min.0 = select i1 %cmp1, i32 %0, i32 %min.0 + br label %do.cond + +do.cond: + %max.1 = phi i32 [ %0, %do.body ], [ %max.0, %if.else ] + %min.1 = phi i32 [ %min.0, %do.body ], [ %.min.0, %if.else ] +; CHECK: cbnz + %dec = add i32 %n.addr.0, -1 + %tobool = icmp eq i32 %dec, 0 + br i1 %tobool, label %do.end, label %do.body + +do.end: + %sub = sub nsw i32 %max.1, %min.1 + ret i32 %sub +} + +; CHECK-LABEL: fold_inc_true_32: +; CHECK: {{subs.*wzr,|cmp}} w2, #1 +; CHECK-NEXT: csinc w0, w1, w0, eq +; CHECK-NEXT: ret +define i32 @fold_inc_true_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp eq i32 %c, 1 + %inc = add nsw i32 %x, 1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %y, %eq_bb ], [ %inc, %entry ] + ret i32 %cond +} + +; CHECK-LABEL: fold_inc_true_64: +; CHECK: {{subs.*xzr,|cmp}} x2, #1 +; CHECK-NEXT: csinc x0, x1, x0, eq +; CHECK-NEXT: ret +define i64 @fold_inc_true_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 1 + %inc = add nsw i64 %x, 1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %y, %eq_bb ], [ %inc, %entry ] + ret i64 %cond +} + +; CHECK-LABEL: fold_inc_false_32: +; CHECK: {{subs.*wzr,|cmp}} w2, #1 +; CHECK-NEXT: csinc w0, w1, w0, ne +; CHECK-NEXT: ret +define i32 @fold_inc_false_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp eq i32 %c, 1 + %inc = add nsw i32 %x, 1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %inc, %eq_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK-LABEL: fold_inc_false_64: +; CHECK: {{subs.*xzr,|cmp}} x2, #1 +; CHECK-NEXT: csinc x0, x1, x0, ne +; CHECK-NEXT: ret +define i64 @fold_inc_false_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 1 + %inc = add nsw i64 %x, 1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %inc, %eq_bb ], [ %y, %entry ] + ret i64 %cond +} + +; CHECK-LABEL: fold_inv_true_32: +; CHECK: {{subs.*wzr,|cmp}} w2, #1 +; CHECK-NEXT: csinv w0, w1, w0, eq +; CHECK-NEXT: ret +define i32 @fold_inv_true_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp eq i32 %c, 1 + %inv = xor i32 %x, -1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %y, %eq_bb ], [ %inv, %entry ] + ret i32 %cond +} + +; CHECK-LABEL: fold_inv_true_64: +; CHECK: {{subs.*xzr,|cmp}} x2, #1 +; CHECK-NEXT: csinv x0, x1, x0, eq +; CHECK-NEXT: ret +define i64 @fold_inv_true_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 1 + %inv = xor i64 %x, -1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %y, %eq_bb ], [ %inv, %entry ] + ret i64 %cond +} + +; CHECK-LABEL: fold_inv_false_32: +; CHECK: {{subs.*wzr,|cmp}} w2, #1 +; CHECK-NEXT: csinv w0, w1, w0, ne +; CHECK-NEXT: ret +define i32 @fold_inv_false_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp 
eq i32 %c, 1 + %inv = xor i32 %x, -1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %inv, %eq_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK-LABEL: fold_inv_false_64: +; CHECK: {{subs.*xzr,|cmp}} x2, #1 +; CHECK-NEXT: csinv x0, x1, x0, ne +; CHECK-NEXT: ret +define i64 @fold_inv_false_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 1 + %inv = xor i64 %x, -1 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %inv, %eq_bb ], [ %y, %entry ] + ret i64 %cond +} + +; CHECK-LABEL: fold_neg_true_32: +; CHECK: {{subs.*wzr,|cmp}} w2, #1 +; CHECK-NEXT: csneg w0, w1, w0, eq +; CHECK-NEXT: ret +define i32 @fold_neg_true_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp eq i32 %c, 1 + %neg = sub nsw i32 0, %x + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %y, %eq_bb ], [ %neg, %entry ] + ret i32 %cond +} + +; CHECK-LABEL: fold_neg_true_64: +; CHECK: {{subs.*xzr,|cmp}} x2, #1 +; CHECK-NEXT: csneg x0, x1, x0, eq +; CHECK-NEXT: ret +define i64 @fold_neg_true_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 1 + %neg = sub nsw i64 0, %x + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %y, %eq_bb ], [ %neg, %entry ] + ret i64 %cond +} + +; CHECK-LABEL: fold_neg_false_32: +; CHECK: {{subs.*wzr,|cmp}} w2, #1 +; CHECK-NEXT: csneg w0, w1, w0, ne +; CHECK-NEXT: ret +define i32 @fold_neg_false_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp eq i32 %c, 1 + %neg = sub nsw i32 0, %x + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %neg, %eq_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK-LABEL: fold_neg_false_64: +; CHECK: {{subs.*xzr,|cmp}} x2, #1 +; CHECK-NEXT: csneg x0, x1, x0, ne +; CHECK-NEXT: ret +define i64 @fold_neg_false_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 1 + %neg = sub nsw i64 0, %x + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %neg, %eq_bb ], [ %y, %entry ] + ret i64 %cond +} + +; CHECK: cbnz_32 +; CHECK: {{subs.*wzr,|cmp}} w2, #0 +; CHECK-NEXT: csel w0, w1, w0, ne +; CHECK-NEXT: ret +define i32 @cbnz_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp eq i32 %c, 0 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %x, %eq_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK: cbnz_64 +; CHECK: {{subs.*xzr,|cmp}} x2, #0 +; CHECK-NEXT: csel x0, x1, x0, ne +; CHECK-NEXT: ret +define i64 @cbnz_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp eq i64 %c, 0 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %x, %eq_bb ], [ %y, %entry ] + ret i64 %cond +} + +; CHECK: cbz_32 +; CHECK: {{subs.*wzr,|cmp}} w2, #0 +; CHECK-NEXT: csel w0, w1, w0, eq +; CHECK-NEXT: ret +define i32 @cbz_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %tobool = icmp ne i32 %c, 0 + br i1 %tobool, label %ne_bb, label %done + +ne_bb: + br label %done + +done: + %cond = phi i32 [ %x, %ne_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK: cbz_64 +; CHECK: {{subs.*xzr,|cmp}} x2, #0 +; CHECK-NEXT: csel x0, x1, x0, eq +; CHECK-NEXT: ret +define i64 @cbz_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %tobool = icmp ne i64 %c, 0 + br i1 %tobool, label %ne_bb, label %done + 
+ne_bb: + br label %done + +done: + %cond = phi i64 [ %x, %ne_bb ], [ %y, %entry ] + ret i64 %cond +} + +; CHECK: tbnz_32 +; CHECK: {{ands.*xzr,|tst}} x2, #0x80 +; CHECK-NEXT: csel w0, w1, w0, ne +; CHECK-NEXT: ret +define i32 @tbnz_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %mask = and i32 %c, 128 + %tobool = icmp eq i32 %mask, 0 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i32 [ %x, %eq_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK: tbnz_64 +; CHECK: {{ands.*xzr,|tst}} x2, #0x8000000000000000 +; CHECK-NEXT: csel x0, x1, x0, ne +; CHECK-NEXT: ret +define i64 @tbnz_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %mask = and i64 %c, 9223372036854775808 + %tobool = icmp eq i64 %mask, 0 + br i1 %tobool, label %eq_bb, label %done + +eq_bb: + br label %done + +done: + %cond = phi i64 [ %x, %eq_bb ], [ %y, %entry ] + ret i64 %cond +} + +; CHECK: tbz_32 +; CHECK: {{ands.*xzr,|tst}} x2, #0x80 +; CHECK-NEXT: csel w0, w1, w0, eq +; CHECK-NEXT: ret +define i32 @tbz_32(i32 %x, i32 %y, i32 %c) nounwind ssp { +entry: + %mask = and i32 %c, 128 + %tobool = icmp ne i32 %mask, 0 + br i1 %tobool, label %ne_bb, label %done + +ne_bb: + br label %done + +done: + %cond = phi i32 [ %x, %ne_bb ], [ %y, %entry ] + ret i32 %cond +} + +; CHECK: tbz_64 +; CHECK: {{ands.*xzr,|tst}} x2, #0x8000000000000000 +; CHECK-NEXT: csel x0, x1, x0, eq +; CHECK-NEXT: ret +define i64 @tbz_64(i64 %x, i64 %y, i64 %c) nounwind ssp { +entry: + %mask = and i64 %c, 9223372036854775808 + %tobool = icmp ne i64 %mask, 0 + br i1 %tobool, label %ne_bb, label %done + +ne_bb: + br label %done + +done: + %cond = phi i64 [ %x, %ne_bb ], [ %y, %entry ] + ret i64 %cond +} + +; This function from 175.vpr folds an ADDWri into a CSINC. +; Remember to clear the kill flag on the ADDWri. 
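+; There are no CHECK lines for this function; it is only expected to compile
+; cleanly under -stress-early-ifcvt.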
+define i32 @get_ytrack_to_xtracks() nounwind ssp { +entry: + br label %for.body + +for.body: + %x0 = load i32* undef, align 4 + br i1 undef, label %if.then.i146, label %is_sbox.exit155 + +if.then.i146: + %add8.i143 = add nsw i32 0, %x0 + %rem.i144 = srem i32 %add8.i143, %x0 + %add9.i145 = add i32 %rem.i144, 1 + br label %is_sbox.exit155 + +is_sbox.exit155: ; preds = %if.then.i146, %for.body + %seg_offset.0.i151 = phi i32 [ %add9.i145, %if.then.i146 ], [ undef, %for.body ] + %idxprom15.i152 = sext i32 %seg_offset.0.i151 to i64 + %arrayidx18.i154 = getelementptr inbounds i32* null, i64 %idxprom15.i152 + %x1 = load i32* %arrayidx18.i154, align 4 + br i1 undef, label %for.body51, label %for.body + +for.body51: ; preds = %is_sbox.exit155 + call fastcc void @get_switch_type(i32 %x1, i32 undef, i16 signext undef, i16 signext undef, i16* undef) + unreachable +} +declare fastcc void @get_switch_type(i32, i32, i16 signext, i16 signext, i16* nocapture) nounwind ssp diff --git a/test/CodeGen/ARM64/elf-calls.ll b/test/CodeGen/ARM64/elf-calls.ll new file mode 100644 index 0000000000..8c4020327b --- /dev/null +++ b/test/CodeGen/ARM64/elf-calls.ll @@ -0,0 +1,20 @@ +; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s +; RUN: llc -mtriple=arm64-linux-gnu -filetype=obj -o - %s | llvm-objdump -triple=arm64-linux-gnu - -r | FileCheck %s --check-prefix=CHECK-OBJ + +declare void @callee() + +define void @caller() { + call void @callee() + ret void +; CHECK-LABEL: caller: +; CHECK: bl callee +; CHECK-OBJ: R_AARCH64_CALL26 callee +} + +define void @tail_caller() { + tail call void @callee() + ret void +; CHECK-LABEL: tail_caller: +; CHECK: b callee +; CHECK-OBJ: R_AARCH64_JUMP26 callee +} diff --git a/test/CodeGen/ARM64/elf-constpool.ll b/test/CodeGen/ARM64/elf-constpool.ll new file mode 100644 index 0000000000..95d334376b --- /dev/null +++ b/test/CodeGen/ARM64/elf-constpool.ll @@ -0,0 +1,13 @@ +; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s +; RUN: llc -mtriple=arm64-linux-gnu -O0 -o - %s | FileCheck %s + +; O0 checked for fastisel purposes. It has a separate path which +; creates a constpool entry for floating values. 
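+; Either way, the 3.14159 literal should end up in an .LCPI constant-pool entry
+; and be loaded with the adrp/ldr pair checked below.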
+ +define double @needs_const() { + ret double 3.14159 +; CHECK: .LCPI0_0: + +; CHECK: adrp {{x[0-9]+}}, .LCPI0_0 +; CHECK: ldr d0, [{{x[0-9]+}}, :lo12:.LCPI0_0] +} diff --git a/test/CodeGen/ARM64/elf-globals.ll b/test/CodeGen/ARM64/elf-globals.ll new file mode 100644 index 0000000000..598c96ae48 --- /dev/null +++ b/test/CodeGen/ARM64/elf-globals.ll @@ -0,0 +1,115 @@ +; RUN: llc -mtriple=arm64-linux-gnu -o - %s | FileCheck %s +; RUN: llc -mtriple=arm64-linux-gnu -o - %s -O0 | FileCheck %s --check-prefix=CHECK-FAST +; RUN: llc -mtriple=arm64-linux-gnu -relocation-model=pic -o - %s | FileCheck %s --check-prefix=CHECK-PIC +; RUN: llc -mtriple=arm64-linux-gnu -O0 -relocation-model=pic -o - %s | FileCheck %s --check-prefix=CHECK-FAST-PIC + +@var8 = external global i8, align 1 +@var16 = external global i16, align 2 +@var32 = external global i32, align 4 +@var64 = external global i64, align 8 + +define i8 @test_i8(i8 %new) { + %val = load i8* @var8, align 1 + store i8 %new, i8* @var8 + ret i8 %val +; CHECK-LABEL: test_i8: +; CHECK: adrp x[[HIREG:[0-9]+]], var8 +; CHECK: ldrb {{w[0-9]+}}, [x[[HIREG]], :lo12:var8] +; CHECK: strb {{w[0-9]+}}, [x[[HIREG]], :lo12:var8] + +; CHECK-PIC-LABEL: test_i8: +; CHECK-PIC: adrp x[[HIREG:[0-9]+]], :got:var8 +; CHECK-PIC: ldr x[[VAR_ADDR:[0-9]+]], [x[[HIREG]], :got_lo12:var8] +; CHECK-PIC: ldrb {{w[0-9]+}}, [x[[VAR_ADDR]]] + +; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var8 +; CHECK-FAST: ldrb {{w[0-9]+}}, [x[[HIREG]], :lo12:var8] + +; CHECK-FAST-PIC: adrp x[[HIREG:[0-9]+]], :got:var8 +; CHECK-FAST-PIC: ldr x[[VARADDR:[0-9]+]], [x[[HIREG]], :got_lo12:var8] +; CHECK-FAST-PIC: ldr {{w[0-9]+}}, [x[[VARADDR]]] +} + +define i16 @test_i16(i16 %new) { + %val = load i16* @var16, align 2 + store i16 %new, i16* @var16 + ret i16 %val +; CHECK-LABEL: test_i16: +; CHECK: adrp x[[HIREG:[0-9]+]], var16 +; CHECK: ldrh {{w[0-9]+}}, [x[[HIREG]], :lo12:var16] +; CHECK: strh {{w[0-9]+}}, [x[[HIREG]], :lo12:var16] + +; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var16 +; CHECK-FAST: ldrh {{w[0-9]+}}, [x[[HIREG]], :lo12:var16] +} + +define i32 @test_i32(i32 %new) { + %val = load i32* @var32, align 4 + store i32 %new, i32* @var32 + ret i32 %val +; CHECK-LABEL: test_i32: +; CHECK: adrp x[[HIREG:[0-9]+]], var32 +; CHECK: ldr {{w[0-9]+}}, [x[[HIREG]], :lo12:var32] +; CHECK: str {{w[0-9]+}}, [x[[HIREG]], :lo12:var32] + +; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var32 +; CHECK-FAST: add {{x[0-9]+}}, x[[HIREG]], :lo12:var32 +} + +define i64 @test_i64(i64 %new) { + %val = load i64* @var64, align 8 + store i64 %new, i64* @var64 + ret i64 %val +; CHECK-LABEL: test_i64: +; CHECK: adrp x[[HIREG:[0-9]+]], var64 +; CHECK: ldr {{x[0-9]+}}, [x[[HIREG]], :lo12:var64] +; CHECK: str {{x[0-9]+}}, [x[[HIREG]], :lo12:var64] + +; CHECK-FAST: adrp x[[HIREG:[0-9]+]], var64 +; CHECK-FAST: add {{x[0-9]+}}, x[[HIREG]], :lo12:var64 +} + +define i64* @test_addr() { + ret i64* @var64 +; CHECK-LABEL: test_addr: +; CHECK: adrp [[HIREG:x[0-9]+]], var64 +; CHECK: add x0, [[HIREG]], :lo12:var64 + +; CHECK-FAST: adrp [[HIREG:x[0-9]+]], var64 +; CHECK-FAST: add x0, [[HIREG]], :lo12:var64 +} + +@hiddenvar = hidden global i32 0, align 4 +@protectedvar = protected global i32 0, align 4 + +define i32 @test_vis() { + %lhs = load i32* @hiddenvar, align 4 + %rhs = load i32* @protectedvar, align 4 + %ret = add i32 %lhs, %rhs + ret i32 %ret +; CHECK-PIC: adrp {{x[0-9]+}}, hiddenvar +; CHECK-PIC: ldr {{w[0-9]+}}, [{{x[0-9]+}}, :lo12:hiddenvar] +; CHECK-PIC: adrp {{x[0-9]+}}, protectedvar +; CHECK-PIC: ldr {{w[0-9]+}}, [{{x[0-9]+}}, 
:lo12:protectedvar] +} + +@var_default = external global [2 x i32] + +define i32 @test_default_align() { + %addr = getelementptr [2 x i32]* @var_default, i32 0, i32 0 + %val = load i32* %addr + ret i32 %val +; CHECK-LABEL: test_default_align: +; CHECK: adrp x[[HIREG:[0-9]+]], var_default +; CHECK: ldr w0, [x[[HIREG]], :lo12:var_default] +} + +define i64 @test_default_unaligned() { + %addr = bitcast [2 x i32]* @var_default to i64* + %val = load i64* %addr + ret i64 %val +; CHECK-LABEL: test_default_unaligned: +; CHECK: adrp [[HIREG:x[0-9]+]], var_default +; CHECK: add x[[ADDR:[0-9]+]], [[HIREG]], :lo12:var_default +; CHECK: ldr x0, [x[[ADDR]]] +} diff --git a/test/CodeGen/ARM64/ext.ll b/test/CodeGen/ARM64/ext.ll new file mode 100644 index 0000000000..57d6e0c67b --- /dev/null +++ b/test/CodeGen/ARM64/ext.ll @@ -0,0 +1,101 @@ +; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s + +define <8 x i8> @test_vextd(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK-LABEL: test_vextd: +;CHECK: {{ext.8b.*#3}} + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> + ret <8 x i8> %tmp3 +} + +define <8 x i8> @test_vextRd(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK-LABEL: test_vextRd: +;CHECK: {{ext.8b.*#5}} + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> + ret <8 x i8> %tmp3 +} + +define <16 x i8> @test_vextq(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK-LABEL: test_vextq: +;CHECK: {{ext.16b.*3}} + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> + ret <16 x i8> %tmp3 +} + +define <16 x i8> @test_vextRq(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK-LABEL: test_vextRq: +;CHECK: {{ext.16b.*7}} + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> + ret <16 x i8> %tmp3 +} + +define <4 x i16> @test_vextd16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK-LABEL: test_vextd16: +;CHECK: {{ext.8b.*#6}} + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = shufflevector <4 x i16> %tmp1, <4 x i16> %tmp2, <4 x i32> + ret <4 x i16> %tmp3 +} + +define <4 x i32> @test_vextq32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK-LABEL: test_vextq32: +;CHECK: {{ext.16b.*12}} + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = shufflevector <4 x i32> %tmp1, <4 x i32> %tmp2, <4 x i32> + ret <4 x i32> %tmp3 +} + +; Undef shuffle indices should not prevent matching to VEXT: + +define <8 x i8> @test_vextd_undef(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK-LABEL: test_vextd_undef: +;CHECK: {{ext.8b.*}} + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = shufflevector <8 x i8> %tmp1, <8 x i8> %tmp2, <8 x i32> + ret <8 x i8> %tmp3 +} + +define <16 x i8> @test_vextRq_undef(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK-LABEL: test_vextRq_undef: +;CHECK: {{ext.16b.*#7}} + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = shufflevector <16 x i8> %tmp1, <16 x i8> %tmp2, <16 x i32> + ret <16 x i8> %tmp3 +} + +; Tests for ReconstructShuffle function. Indices have to be carefully +; chosen to reach lowering phase as a BUILD_VECTOR. + +; One vector needs vext, the other can be handled by extract_subvector +; Also checks interleaving of sources is handled correctly. +; Essence: a vext is used on %A and something saner than stack load/store for final result. 
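+; The checks below only look for the ext.8b and the zip1.4h that this lowering
+; is expected to produce.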
+define <4 x i16> @test_interleaved(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK-LABEL: test_interleaved: +;CHECK: ext.8b +;CHECK: zip1.4h + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <4 x i32> + ret <4 x i16> %tmp3 +} + +; An undef in the shuffle list should still be optimizable +define <4 x i16> @test_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK-LABEL: test_undef: +;CHECK: zip1.4h + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <4 x i32> + ret <4 x i16> %tmp3 +} diff --git a/test/CodeGen/ARM64/extend-int-to-fp.ll b/test/CodeGen/ARM64/extend-int-to-fp.ll new file mode 100644 index 0000000000..599a697a31 --- /dev/null +++ b/test/CodeGen/ARM64/extend-int-to-fp.ll @@ -0,0 +1,19 @@ +; RUN: llc < %s -march=arm64 -arm64-neon-syntax=apple | FileCheck %s + +define <4 x float> @foo(<4 x i16> %a) nounwind { +; CHECK-LABEL: foo: +; CHECK: ushll.4s v0, v0, #0 +; CHECK-NEXT: ucvtf.4s v0, v0 +; CHECK-NEXT: ret + %vcvt.i = uitofp <4 x i16> %a to <4 x float> + ret <4 x float> %vcvt.i +} + +define <4 x float> @bar(<4 x i16> %a) nounwind { +; CHECK-LABEL: bar: +; CHECK: sshll.4s v0, v0, #0 +; CHECK-NEXT: scvtf.4s v0, v0 +; CHECK-NEXT: ret + %vcvt.i = sitofp <4 x i16> %a to <4 x float> + ret <4 x float> %vcvt.i +} diff --git a/test/CodeGen/ARM64/extend.ll b/test/CodeGen/ARM64/extend.ll new file mode 100644 index 0000000000..4d20543671 --- /dev/null +++ b/test/CodeGen/ARM64/extend.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s +@array = external global [0 x i32] + +define i64 @foo(i32 %i) { +; CHECK: foo +; CHECK: adrp x[[REG:[0-9]+]], _array@GOTPAGE +; CHECK: ldr x[[REG1:[0-9]+]], [x[[REG]], _array@GOTPAGEOFF] +; CHECK: ldrsw x0, [x[[REG1]], x0, sxtw #2] +; CHECK: ret + %idxprom = sext i32 %i to i64 + %arrayidx = getelementptr inbounds [0 x i32]* @array, i64 0, i64 %idxprom + %tmp1 = load i32* %arrayidx, align 4 + %conv = sext i32 %tmp1 to i64 + ret i64 %conv +} diff --git a/test/CodeGen/ARM64/extload-knownzero.ll b/test/CodeGen/ARM64/extload-knownzero.ll new file mode 100644 index 0000000000..14e5fd310d --- /dev/null +++ b/test/CodeGen/ARM64/extload-knownzero.ll @@ -0,0 +1,28 @@ +; RUN: llc < %s -march=arm64 | FileCheck %s +; rdar://12771555 + +define void @foo(i16* %ptr, i32 %a) nounwind { +entry: +; CHECK-LABEL: foo: + %tmp1 = icmp ult i32 %a, 100 + br i1 %tmp1, label %bb1, label %bb2 +bb1: +; CHECK: %bb1 +; CHECK: ldrh [[REG:w[0-9]+]] + %tmp2 = load i16* %ptr, align 2 + br label %bb2 +bb2: +; CHECK: %bb2 +; CHECK-NOT: and {{w[0-9]+}}, [[REG]], #0xffff +; CHECK: cmp [[REG]], #23 + %tmp3 = phi i16 [ 0, %entry ], [ %tmp2, %bb1 ] + %cmp = icmp ult i16 %tmp3, 24 + br i1 %cmp, label %bb3, label %exit +bb3: + call void @bar() nounwind + br label %exit +exit: + ret void +} + +declare void @bar () diff --git a/test/CodeGen/ARM64/extract.ll b/test/CodeGen/ARM64/extract.ll new file mode 100644 index 0000000000..119751c99e --- /dev/null +++ b/test/CodeGen/ARM64/extract.ll @@ -0,0 +1,58 @@ +; RUN: llc -arm64-extr-generation=true -verify-machineinstrs < %s \ +; RUN: -march=arm64 | FileCheck %s + +define i64 @ror_i64(i64 %in) { +; CHECK-LABEL: ror_i64: + %left = shl i64 %in, 19 + %right = lshr i64 %in, 45 + %val5 = or i64 %left, %right +; CHECK: extr {{x[0-9]+}}, x0, x0, #45 + ret i64 %val5 +} + +define i32 @ror_i32(i32 %in) { +; CHECK-LABEL: ror_i32: + %left = shl i32 %in, 9 + %right = lshr i32 %in, 23 + %val5 = or i32 %left, %right +; 
CHECK: extr {{w[0-9]+}}, w0, w0, #23
+ ret i32 %val5
+}
+
+define i32 @extr_i32(i32 %lhs, i32 %rhs) {
+; CHECK-LABEL: extr_i32:
+ %left = shl i32 %lhs, 6
+ %right = lshr i32 %rhs, 26
+ %val = or i32 %left, %right
+ ; Order of lhs and rhs matters here. Regalloc would have to be very odd to use
+ ; something other than w0 and w1.
+; CHECK: extr {{w[0-9]+}}, w0, w1, #26
+
+ ret i32 %val
+}
+
+define i64 @extr_i64(i64 %lhs, i64 %rhs) {
+; CHECK-LABEL: extr_i64:
+ %right = lshr i64 %rhs, 40
+ %left = shl i64 %lhs, 24
+ %val = or i64 %right, %left
+ ; Order of lhs and rhs matters here. Regalloc would have to be very odd to use
+ ; something other than w0 and w1.
+; CHECK: extr {{x[0-9]+}}, x0, x1, #40
+
+ ret i64 %val
+}
+
+; Regression test: a bad experimental pattern crept into git which optimised
+; this pattern to a single EXTR.
+define i32 @extr_regress(i32 %a, i32 %b) {
+; CHECK-LABEL: extr_regress:
+
+ %sh1 = shl i32 %a, 14
+ %sh2 = lshr i32 %b, 14
+ %val = or i32 %sh2, %sh1
+; CHECK-NOT: extr {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}, #{{[0-9]+}}
+
+ ret i32 %val
+; CHECK: ret
+}
diff --git a/test/CodeGen/ARM64/extract_subvector.ll b/test/CodeGen/ARM64/extract_subvector.ll
new file mode 100644
index 0000000000..20c05fb232
--- /dev/null
+++ b/test/CodeGen/ARM64/extract_subvector.ll
@@ -0,0 +1,51 @@
+; RUN: llc -march=arm64 -arm64-neon-syntax=apple < %s | FileCheck %s
+
+; Extract of an upper half of a vector is an "ext.16b v0, v0, v0, #8" insn.
+
+define <8 x i8> @v8i8(<16 x i8> %a) nounwind {
+; CHECK: v8i8
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <16 x i8> %a, <16 x i8> %a, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <8 x i8> %ret
+}
+
+define <4 x i16> @v4i16(<8 x i16> %a) nounwind {
+; CHECK-LABEL: v4i16:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <8 x i16> %a, <8 x i16> %a, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+ ret <4 x i16> %ret
+}
+
+define <2 x i32> @v2i32(<4 x i32> %a) nounwind {
+; CHECK-LABEL: v2i32:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <4 x i32> %a, <4 x i32> %a, <2 x i32> <i32 2, i32 3>
+ ret <2 x i32> %ret
+}
+
+define <1 x i64> @v1i64(<2 x i64> %a) nounwind {
+; CHECK-LABEL: v1i64:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <2 x i64> %a, <2 x i64> %a, <1 x i32> <i32 1>
+ ret <1 x i64> %ret
+}
+
+define <2 x float> @v2f32(<4 x float> %a) nounwind {
+; CHECK-LABEL: v2f32:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <4 x float> %a, <4 x float> %a, <2 x i32> <i32 2, i32 3>
+ ret <2 x float> %ret
+}
+
+define <1 x double> @v1f64(<2 x double> %a) nounwind {
+; CHECK-LABEL: v1f64:
+; CHECK: ext.16b v0, v0, v0, #8
+; CHECK: ret
+ %ret = shufflevector <2 x double> %a, <2 x double> %a, <1 x i32> <i32 1>
+ ret <1 x double> %ret
+}
diff --git a/test/CodeGen/ARM64/fast-isel-addr-offset.ll b/test/CodeGen/ARM64/fast-isel-addr-offset.ll
new file mode 100644
index 0000000000..a4326dc2b8
--- /dev/null
+++ b/test/CodeGen/ARM64/fast-isel-addr-offset.ll
@@ -0,0 +1,47 @@
+; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s
+
+@sortlist = common global [5001 x i32] zeroinitializer, align 16
+@sortlist2 = common global [5001 x i64] zeroinitializer, align 16
+
+; Load an address with an offset larger than an LDR immediate can handle
+define i32 @foo() nounwind {
+entry:
+; CHECK: @foo
+; CHECK: adrp x[[REG:[0-9]+]], _sortlist@GOTPAGE
+; CHECK: ldr x[[REG1:[0-9]+]], [x[[REG]], _sortlist@GOTPAGEOFF]
+; CHECK: movz x[[REG2:[0-9]+]], #20000
+; CHECK: add x[[REG3:[0-9]+]], x[[REG1]], x[[REG2]]
+; CHECK: ldr w0, [x[[REG3]]]
+; CHECK: ret
+ %0 =
load i32* getelementptr inbounds ([5001 x i32]* @sortlist, i32 0, i64 5000), align 4 + ret i32 %0 +} + +define i64 @foo2() nounwind { +entry: +; CHECK: @foo2 +; CHECK: adrp x[[REG:[0-9]+]], _sortlist2@GOTPAGE +; CHECK: ldr x[[REG1:[0-9]+]], [x[[REG]], _sortlist2@GOTPAGEOFF] +; CHECK: movz x[[REG2:[0-9]+]], #40000 +; CHECK: add x[[REG3:[0-9]+]], x[[REG1]], x[[REG2]] +; CHECK: ldr x0, [x[[REG3]]] +; CHECK: ret + %0 = load i64* getelementptr inbounds ([5001 x i64]* @sortlist2, i32 0, i64 5000), align 4 + ret i64 %0 +} + +; Load an address with a ridiculously large offset. +; rdar://12505553 +@pd2 = common global i8* null, align 8 + +define signext i8 @foo3() nounwind ssp { +entry: +; CHECK: @foo3 +; CHECK: movz x[[REG:[0-9]+]], #2874, lsl #32 +; CHECK: movk x[[REG]], #29646, lsl #16 +; CHECK: movk x[[REG]], #12274 + %0 = load i8** @pd2, align 8 + %arrayidx = getelementptr inbounds i8* %0, i64 12345678901234 + %1 = load i8* %arrayidx, align 1 + ret i8 %1 +} diff --git a/test/CodeGen/ARM64/fast-isel-alloca.ll b/test/CodeGen/ARM64/fast-isel-alloca.ll new file mode 100644 index 0000000000..8bbee16232 --- /dev/null +++ b/test/CodeGen/ARM64/fast-isel-alloca.ll @@ -0,0 +1,24 @@ +; This test should cause the TargetMaterializeAlloca to be invoked +; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s + +%struct.S1Ty = type { i64 } +%struct.S2Ty = type { %struct.S1Ty, %struct.S1Ty } + +define void @takeS1(%struct.S1Ty* %V) nounwind { +entry: + %V.addr = alloca %struct.S1Ty*, align 8 + store %struct.S1Ty* %V, %struct.S1Ty** %V.addr, align 8 + ret void +} + +define void @main() nounwind { +entry: +; CHECK: main +; CHECK: mov x[[REG:[0-9]+]], sp +; CHECK-NEXT: orr x[[REG1:[0-9]+]], xzr, #0x8 +; CHECK-NEXT: add x0, x[[REG]], x[[REG1]] + %E = alloca %struct.S2Ty, align 4 + %B = getelementptr inbounds %struct.S2Ty* %E, i32 0, i32 1 + call void @takeS1(%struct.S1Ty* %B) + ret void +} diff --git a/test/CodeGen/ARM64/fast-isel-br.ll b/test/CodeGen/ARM64/fast-isel-br.ll new file mode 100644 index 0000000000..8fd32fdd35 --- /dev/null +++ b/test/CodeGen/ARM64/fast-isel-br.ll @@ -0,0 +1,155 @@ +; RUN: llc < %s -O0 -fast-isel-abort -mtriple=arm64-apple-darwin | FileCheck %s + +define void @branch1() nounwind uwtable ssp { + %x = alloca i32, align 4 + store i32 0, i32* %x, align 4 + %1 = load i32* %x, align 4 + %2 = icmp ne i32 %1, 0 + br i1 %2, label %3, label %4 + +;