summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHal Finkel <hfinkel@anl.gov>2012-11-14 22:58:30 +0000
committerHal Finkel <hfinkel@anl.gov>2012-11-14 22:58:30 +0000
commitadea7bc349a18a1c8f9ba94bff61d5938075e600 (patch)
tree0afc76dd97f6999e05fb2da19c12be046753494b
parent9fb32372e7ce645badf32b64e568a9cb5bef2442 (diff)
downloadllvm-adea7bc349a18a1c8f9ba94bff61d5938075e600.tar.gz
llvm-adea7bc349a18a1c8f9ba94bff61d5938075e600.tar.bz2
llvm-adea7bc349a18a1c8f9ba94bff61d5938075e600.tar.xz
Merge BBVectorizer changes r167731, r167743, r167750, r167784, r167811, r167817.
These changes fix a serious interaction problem with the cost model on x86 that could cause the vectorizer to enter an infinite loop (and sometimes crash in other ways). git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_32@167993 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--lib/Transforms/Vectorize/BBVectorize.cpp112
-rw-r--r--test/Transforms/BBVectorize/X86/cmp-types.ll16
-rw-r--r--test/Transforms/BBVectorize/X86/sh-rec.ll54
-rw-r--r--test/Transforms/BBVectorize/X86/sh-rec2.ll85
-rw-r--r--test/Transforms/BBVectorize/X86/sh-rec3.ll170
-rw-r--r--test/Transforms/BBVectorize/X86/sh-types.ll25
6 files changed, 433 insertions, 29 deletions
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp
index 4653a7d7c8..df50589eb6 100644
--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -28,6 +28,7 @@
#include "llvm/Type.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/STLExtras.h"
@@ -483,6 +484,10 @@ namespace {
if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
T2 = SI->getCondition()->getType();
+ } else if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(I)) {
+ T2 = SI->getOperand(0)->getType();
+ } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) {
+ T2 = CI->getOperand(0)->getType();
}
}
@@ -671,6 +676,19 @@ namespace {
return false;
}
+
+ bool isPureIEChain(InsertElementInst *IE) {
+ InsertElementInst *IENext = IE;
+ do {
+ if (!isa<UndefValue>(IENext->getOperand(0)) &&
+ !isa<InsertElementInst>(IENext->getOperand(0))) {
+ return false;
+ }
+ } while ((IENext =
+ dyn_cast<InsertElementInst>(IENext->getOperand(0))));
+
+ return true;
+ }
};
// This function implements one vectorization iteration on the provided
@@ -987,10 +1005,11 @@ namespace {
// We don't want to fuse to a type that will be split, even
// if the two input types will also be split and there is no other
// associated cost.
- unsigned VParts = VTTI->getNumberOfParts(VT1);
- if (VParts > 1)
+ unsigned VParts1 = VTTI->getNumberOfParts(VT1),
+ VParts2 = VTTI->getNumberOfParts(VT2);
+ if (VParts1 > 1 || VParts2 > 1)
return false;
- else if (!VParts && VCost == ICost + JCost)
+ else if ((!VParts1 || !VParts2) && VCost == ICost + JCost)
return false;
CostSavings = ICost + JCost - VCost;
@@ -1683,10 +1702,20 @@ namespace {
// The set of pairs that have already contributed to the total cost.
DenseSet<ValuePair> IncomingPairs;
+ // If the cost model were perfect, this might not be necessary; but we
+ // need to make sure that we don't get stuck vectorizing our own
+ // shuffle chains.
+ bool HasNontrivialInsts = false;
+
// The node weights represent the cost savings associated with
// fusing the pair of instructions.
for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
E = PrunedTree.end(); S != E; ++S) {
+ if (!isa<ShuffleVectorInst>(S->first) &&
+ !isa<InsertElementInst>(S->first) &&
+ !isa<ExtractElementInst>(S->first))
+ HasNontrivialInsts = true;
+
bool FlipOrder = false;
if (getDepthFactor(S->first)) {
@@ -1760,9 +1789,12 @@ namespace {
bool NeedsExtraction = false;
for (Value::use_iterator I = S->first->use_begin(),
IE = S->first->use_end(); I != IE; ++I) {
- if (isa<ShuffleVectorInst>(*I) ||
- isa<InsertElementInst>(*I) ||
- isa<ExtractElementInst>(*I))
+ if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) {
+ // Shuffle can be folded if it has no other input
+ if (isa<UndefValue>(SI->getOperand(1)))
+ continue;
+ }
+ if (isa<ExtractElementInst>(*I))
continue;
if (PrunedTreeInstrs.count(*I))
continue;
@@ -1787,9 +1819,12 @@ namespace {
NeedsExtraction = false;
for (Value::use_iterator I = S->second->use_begin(),
IE = S->second->use_end(); I != IE; ++I) {
- if (isa<ShuffleVectorInst>(*I) ||
- isa<InsertElementInst>(*I) ||
- isa<ExtractElementInst>(*I))
+ if (ShuffleVectorInst *SI = dyn_cast<ShuffleVectorInst>(*I)) {
+ // Shuffle can be folded if it has no other input
+ if (isa<UndefValue>(SI->getOperand(1)))
+ continue;
+ }
+ if (isa<ExtractElementInst>(*I))
continue;
if (PrunedTreeInstrs.count(*I))
continue;
@@ -1839,14 +1874,37 @@ namespace {
// Combining vector operations of the same type is also assumed
// folded with other operations.
- if (Ty1 == Ty2 &&
- (isa<ShuffleVectorInst>(O1) ||
- isa<InsertElementInst>(O1) ||
- isa<InsertElementInst>(O1)) &&
- (isa<ShuffleVectorInst>(O2) ||
- isa<InsertElementInst>(O2) ||
- isa<InsertElementInst>(O2)))
- continue;
+ if (Ty1 == Ty2) {
+ // If both are insert elements, then both can be widened.
+ InsertElementInst *IEO1 = dyn_cast<InsertElementInst>(O1),
+ *IEO2 = dyn_cast<InsertElementInst>(O2);
+ if (IEO1 && IEO2 && isPureIEChain(IEO1) && isPureIEChain(IEO2))
+ continue;
+ // If both are extract elements, and both have the same input
+ // type, then they can be replaced with a shuffle
+ ExtractElementInst *EIO1 = dyn_cast<ExtractElementInst>(O1),
+ *EIO2 = dyn_cast<ExtractElementInst>(O2);
+ if (EIO1 && EIO2 &&
+ EIO1->getOperand(0)->getType() ==
+ EIO2->getOperand(0)->getType())
+ continue;
+ // If both are a shuffle with equal operand types and only two
+ // unique operands, then they can be replaced with a single
+ // shuffle
+ ShuffleVectorInst *SIO1 = dyn_cast<ShuffleVectorInst>(O1),
+ *SIO2 = dyn_cast<ShuffleVectorInst>(O2);
+ if (SIO1 && SIO2 &&
+ SIO1->getOperand(0)->getType() ==
+ SIO2->getOperand(0)->getType()) {
+ SmallSet<Value *, 4> SIOps;
+ SIOps.insert(SIO1->getOperand(0));
+ SIOps.insert(SIO1->getOperand(1));
+ SIOps.insert(SIO2->getOperand(0));
+ SIOps.insert(SIO2->getOperand(1));
+ if (SIOps.size() <= 2)
+ continue;
+ }
+ }
int ESContrib;
// This pair has already been formed.
@@ -1894,6 +1952,13 @@ namespace {
}
}
}
+
+ if (!HasNontrivialInsts) {
+ DEBUG(if (DebugPairSelection) dbgs() <<
+ "\tNo non-trivial instructions in tree;"
+ " override to zero effective size\n");
+ EffSize = 0;
+ }
} else {
for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(),
E = PrunedTree.end(); S != E; ++S)
@@ -2092,18 +2157,7 @@ namespace {
if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) {
// If we have a pure insertelement chain, then this can be rewritten
// into a chain that directly builds the larger type.
- bool PureChain = true;
- InsertElementInst *LIENext = LIE;
- do {
- if (!isa<UndefValue>(LIENext->getOperand(0)) &&
- !isa<InsertElementInst>(LIENext->getOperand(0))) {
- PureChain = false;
- break;
- }
- } while ((LIENext =
- dyn_cast<InsertElementInst>(LIENext->getOperand(0))));
-
- if (PureChain) {
+ if (isPureIEChain(LIE)) {
SmallVector<Value *, 8> VectElemts(numElemL,
UndefValue::get(ArgTypeL->getScalarType()));
InsertElementInst *LIENext = LIE;
diff --git a/test/Transforms/BBVectorize/X86/cmp-types.ll b/test/Transforms/BBVectorize/X86/cmp-types.ll
new file mode 100644
index 0000000000..a4fcbb6048
--- /dev/null
+++ b/test/Transforms/BBVectorize/X86/cmp-types.ll
@@ -0,0 +1,16 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s
+
+%"struct.btSoftBody" = type { float, float, float*, i8 }
+
+define void @test1(%"struct.btSoftBody"* %n1, %"struct.btSoftBody"* %n2) uwtable align 2 {
+entry:
+ %tobool15 = icmp ne %"struct.btSoftBody"* %n1, null
+ %cond16 = zext i1 %tobool15 to i32
+ %tobool21 = icmp ne %"struct.btSoftBody"* %n2, null
+ %cond22 = zext i1 %tobool21 to i32
+ ret void
+; CHECK: @test1
+}
+
diff --git a/test/Transforms/BBVectorize/X86/sh-rec.ll b/test/Transforms/BBVectorize/X86/sh-rec.ll
new file mode 100644
index 0000000000..1e0492c2a8
--- /dev/null
+++ b/test/Transforms/BBVectorize/X86/sh-rec.ll
@@ -0,0 +1,54 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s
+
+define void @ptoa() nounwind uwtable {
+entry:
+ %call = call i8* @malloc() nounwind
+ br i1 undef, label %return, label %if.end10
+
+if.end10: ; preds = %entry
+ %incdec.ptr = getelementptr inbounds i8* %call, i64 undef
+ %call17 = call i32 @ptou() nounwind
+ %incdec.ptr26.1 = getelementptr inbounds i8* %incdec.ptr, i64 -2
+ store i8 undef, i8* %incdec.ptr26.1, align 1
+ %div27.1 = udiv i32 %call17, 100
+ %rem.2 = urem i32 %div27.1, 10
+ %add2230.2 = or i32 %rem.2, 48
+ %conv25.2 = trunc i32 %add2230.2 to i8
+ %incdec.ptr26.2 = getelementptr inbounds i8* %incdec.ptr, i64 -3
+ store i8 %conv25.2, i8* %incdec.ptr26.2, align 1
+ %incdec.ptr26.3 = getelementptr inbounds i8* %incdec.ptr, i64 -4
+ store i8 undef, i8* %incdec.ptr26.3, align 1
+ %div27.3 = udiv i32 %call17, 10000
+ %rem.4 = urem i32 %div27.3, 10
+ %add2230.4 = or i32 %rem.4, 48
+ %conv25.4 = trunc i32 %add2230.4 to i8
+ %incdec.ptr26.4 = getelementptr inbounds i8* %incdec.ptr, i64 -5
+ store i8 %conv25.4, i8* %incdec.ptr26.4, align 1
+ %div27.4 = udiv i32 %call17, 100000
+ %rem.5 = urem i32 %div27.4, 10
+ %add2230.5 = or i32 %rem.5, 48
+ %conv25.5 = trunc i32 %add2230.5 to i8
+ %incdec.ptr26.5 = getelementptr inbounds i8* %incdec.ptr, i64 -6
+ store i8 %conv25.5, i8* %incdec.ptr26.5, align 1
+ %incdec.ptr26.6 = getelementptr inbounds i8* %incdec.ptr, i64 -7
+ store i8 0, i8* %incdec.ptr26.6, align 1
+ %incdec.ptr26.7 = getelementptr inbounds i8* %incdec.ptr, i64 -8
+ store i8 undef, i8* %incdec.ptr26.7, align 1
+ %div27.7 = udiv i32 %call17, 100000000
+ %rem.8 = urem i32 %div27.7, 10
+ %add2230.8 = or i32 %rem.8, 48
+ %conv25.8 = trunc i32 %add2230.8 to i8
+ %incdec.ptr26.8 = getelementptr inbounds i8* %incdec.ptr, i64 -9
+ store i8 %conv25.8, i8* %incdec.ptr26.8, align 1
+ unreachable
+
+return: ; preds = %entry
+ ret void
+; CHECK: @ptoa
+}
+
+declare noalias i8* @malloc() nounwind
+
+declare i32 @ptou()
diff --git a/test/Transforms/BBVectorize/X86/sh-rec2.ll b/test/Transforms/BBVectorize/X86/sh-rec2.ll
new file mode 100644
index 0000000000..ef2239932f
--- /dev/null
+++ b/test/Transforms/BBVectorize/X86/sh-rec2.ll
@@ -0,0 +1,85 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+; RUN: opt < %s -basicaa -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s
+
+%struct.gsm_state.2.8.14.15.16.17.19.22.23.25.26.28.29.31.32.33.35.36.37.38.40.41.42.44.45.47.48.50.52.53.54.56.57.58.59.60.61.62.63.66.73.83.84.89.90.91.92.93.94.95.96.99.100.101.102.103.104.106.107.114.116.121.122.129.130.135.136.137.138.139.140.141.142.143.144.147.148.149.158.159.160.161.164.165.166.167.168.169.172.179.181.182.183.188.195.200.201.202.203.204.205.208.209.210.212.213.214.215.222.223.225.226.230.231.232.233.234.235.236.237.238.239.240.241.242.243.244.352 = type { [280 x i16], i16, i64, i32, [8 x i16], [2 x [8 x i16]], i16, i16, [9 x i16], i16, i8, i8 }
+
+define void @gsm_encode(%struct.gsm_state.2.8.14.15.16.17.19.22.23.25.26.28.29.31.32.33.35.36.37.38.40.41.42.44.45.47.48.50.52.53.54.56.57.58.59.60.61.62.63.66.73.83.84.89.90.91.92.93.94.95.96.99.100.101.102.103.104.106.107.114.116.121.122.129.130.135.136.137.138.139.140.141.142.143.144.147.148.149.158.159.160.161.164.165.166.167.168.169.172.179.181.182.183.188.195.200.201.202.203.204.205.208.209.210.212.213.214.215.222.223.225.226.230.231.232.233.234.235.236.237.238.239.240.241.242.243.244.352* %s, i16* %source, i8* %c) nounwind uwtable {
+entry:
+ %xmc = alloca [52 x i16], align 16
+ %arraydecay5 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 0
+ call void @Gsm_Coder(%struct.gsm_state.2.8.14.15.16.17.19.22.23.25.26.28.29.31.32.33.35.36.37.38.40.41.42.44.45.47.48.50.52.53.54.56.57.58.59.60.61.62.63.66.73.83.84.89.90.91.92.93.94.95.96.99.100.101.102.103.104.106.107.114.116.121.122.129.130.135.136.137.138.139.140.141.142.143.144.147.148.149.158.159.160.161.164.165.166.167.168.169.172.179.181.182.183.188.195.200.201.202.203.204.205.208.209.210.212.213.214.215.222.223.225.226.230.231.232.233.234.235.236.237.238.239.240.241.242.243.244.352* %s, i16* %source, i16* undef, i16* null, i16* undef, i16* undef, i16* undef, i16* %arraydecay5) nounwind
+ %incdec.ptr136 = getelementptr inbounds i8* %c, i64 10
+ %incdec.ptr157 = getelementptr inbounds i8* %c, i64 11
+ store i8 0, i8* %incdec.ptr136, align 1
+ %arrayidx162 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 11
+ %0 = load i16* %arrayidx162, align 2
+ %conv1631 = trunc i16 %0 to i8
+ %and164 = shl i8 %conv1631, 3
+ %shl165 = and i8 %and164, 56
+ %incdec.ptr172 = getelementptr inbounds i8* %c, i64 12
+ store i8 %shl165, i8* %incdec.ptr157, align 1
+ %1 = load i16* inttoptr (i64 2 to i16*), align 2
+ %conv1742 = trunc i16 %1 to i8
+ %and175 = shl i8 %conv1742, 1
+ %incdec.ptr183 = getelementptr inbounds i8* %c, i64 13
+ store i8 %and175, i8* %incdec.ptr172, align 1
+ %incdec.ptr199 = getelementptr inbounds i8* %c, i64 14
+ store i8 0, i8* %incdec.ptr183, align 1
+ %arrayidx214 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 15
+ %incdec.ptr220 = getelementptr inbounds i8* %c, i64 15
+ store i8 0, i8* %incdec.ptr199, align 1
+ %2 = load i16* %arrayidx214, align 2
+ %conv2223 = trunc i16 %2 to i8
+ %and223 = shl i8 %conv2223, 6
+ %incdec.ptr235 = getelementptr inbounds i8* %c, i64 16
+ store i8 %and223, i8* %incdec.ptr220, align 1
+ %arrayidx240 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 19
+ %3 = load i16* %arrayidx240, align 2
+ %conv2414 = trunc i16 %3 to i8
+ %and242 = shl i8 %conv2414, 2
+ %shl243 = and i8 %and242, 28
+ %incdec.ptr251 = getelementptr inbounds i8* %c, i64 17
+ store i8 %shl243, i8* %incdec.ptr235, align 1
+ %incdec.ptr272 = getelementptr inbounds i8* %c, i64 18
+ store i8 0, i8* %incdec.ptr251, align 1
+ %arrayidx282 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 25
+ %4 = load i16* %arrayidx282, align 2
+ %conv2835 = trunc i16 %4 to i8
+ %and284 = and i8 %conv2835, 7
+ %incdec.ptr287 = getelementptr inbounds i8* %c, i64 19
+ store i8 %and284, i8* %incdec.ptr272, align 1
+ %incdec.ptr298 = getelementptr inbounds i8* %c, i64 20
+ store i8 0, i8* %incdec.ptr287, align 1
+ %incdec.ptr314 = getelementptr inbounds i8* %c, i64 21
+ store i8 0, i8* %incdec.ptr298, align 1
+ %arrayidx319 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 26
+ %5 = load i16* %arrayidx319, align 4
+ %conv3206 = trunc i16 %5 to i8
+ %and321 = shl i8 %conv3206, 4
+ %shl322 = and i8 %and321, 112
+ %incdec.ptr335 = getelementptr inbounds i8* %c, i64 22
+ store i8 %shl322, i8* %incdec.ptr314, align 1
+ %arrayidx340 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 29
+ %6 = load i16* %arrayidx340, align 2
+ %conv3417 = trunc i16 %6 to i8
+ %and342 = shl i8 %conv3417, 3
+ %shl343 = and i8 %and342, 56
+ %incdec.ptr350 = getelementptr inbounds i8* %c, i64 23
+ store i8 %shl343, i8* %incdec.ptr335, align 1
+ %incdec.ptr366 = getelementptr inbounds i8* %c, i64 24
+ store i8 0, i8* %incdec.ptr350, align 1
+ %arrayidx381 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 36
+ %incdec.ptr387 = getelementptr inbounds i8* %c, i64 25
+ store i8 0, i8* %incdec.ptr366, align 1
+ %7 = load i16* %arrayidx381, align 8
+ %conv3898 = trunc i16 %7 to i8
+ %and390 = shl i8 %conv3898, 6
+ store i8 %and390, i8* %incdec.ptr387, align 1
+ unreachable
+; CHECK: @gsm_encode
+}
+
+declare void @Gsm_Coder(%struct.gsm_state.2.8.14.15.16.17.19.22.23.25.26.28.29.31.32.33.35.36.37.38.40.41.42.44.45.47.48.50.52.53.54.56.57.58.59.60.61.62.63.66.73.83.84.89.90.91.92.93.94.95.96.99.100.101.102.103.104.106.107.114.116.121.122.129.130.135.136.137.138.139.140.141.142.143.144.147.148.149.158.159.160.161.164.165.166.167.168.169.172.179.181.182.183.188.195.200.201.202.203.204.205.208.209.210.212.213.214.215.222.223.225.226.230.231.232.233.234.235.236.237.238.239.240.241.242.243.244.352*, i16*, i16*, i16*, i16*, i16*, i16*, i16*)
+
+declare void @llvm.trap() noreturn nounwind
diff --git a/test/Transforms/BBVectorize/X86/sh-rec3.ll b/test/Transforms/BBVectorize/X86/sh-rec3.ll
new file mode 100644
index 0000000000..fd2cc8bdd9
--- /dev/null
+++ b/test/Transforms/BBVectorize/X86/sh-rec3.ll
@@ -0,0 +1,170 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+; RUN: opt < %s -basicaa -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s
+
+%struct.gsm_state.2.8.39.44.45.55.56.57.58.59.62.63.64.65.74.75.76.77.80.87.92.93.94.95.96.97.110.111.112.113.114.128.130.135.136.137.138.139.140.141.142.143.144.145.148.149.150.151.152.169.170.177.178.179.184.185.186.187.188.201.208.209.219.220.221.223.224.225.230.231.232.233.235.236.237.238.245.246.248.249.272.274.279.280.281.282.283.286.293.298.299.314.315.316.317.318.319.320.321.322.323.324.325.326.327.328.329.330.331.332.333.334.335.336.337.338.339.340.341.342.343.344.345.346.347.348.349.350.351.352.353.565 = type { [280 x i16], i16, i64, i32, [8 x i16], [2 x [8 x i16]], i16, i16, [9 x i16], i16, i8, i8 }
+
+define void @gsm_encode(%struct.gsm_state.2.8.39.44.45.55.56.57.58.59.62.63.64.65.74.75.76.77.80.87.92.93.94.95.96.97.110.111.112.113.114.128.130.135.136.137.138.139.140.141.142.143.144.145.148.149.150.151.152.169.170.177.178.179.184.185.186.187.188.201.208.209.219.220.221.223.224.225.230.231.232.233.235.236.237.238.245.246.248.249.272.274.279.280.281.282.283.286.293.298.299.314.315.316.317.318.319.320.321.322.323.324.325.326.327.328.329.330.331.332.333.334.335.336.337.338.339.340.341.342.343.344.345.346.347.348.349.350.351.352.353.565* %s, i16* %source, i8* %c) nounwind uwtable {
+entry:
+ %LARc28 = alloca [2 x i64], align 16
+ %LARc28.sub = getelementptr inbounds [2 x i64]* %LARc28, i64 0, i64 0
+ %tmpcast = bitcast [2 x i64]* %LARc28 to [8 x i16]*
+ %Nc = alloca [4 x i16], align 2
+ %Mc = alloca [4 x i16], align 2
+ %bc = alloca [4 x i16], align 2
+ %xmc = alloca [52 x i16], align 16
+ %arraydecay = bitcast [2 x i64]* %LARc28 to i16*
+ %arraydecay1 = getelementptr inbounds [4 x i16]* %Nc, i64 0, i64 0
+ %arraydecay2 = getelementptr inbounds [4 x i16]* %bc, i64 0, i64 0
+ %arraydecay3 = getelementptr inbounds [4 x i16]* %Mc, i64 0, i64 0
+ %arraydecay5 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 0
+ call void @Gsm_Coder(%struct.gsm_state.2.8.39.44.45.55.56.57.58.59.62.63.64.65.74.75.76.77.80.87.92.93.94.95.96.97.110.111.112.113.114.128.130.135.136.137.138.139.140.141.142.143.144.145.148.149.150.151.152.169.170.177.178.179.184.185.186.187.188.201.208.209.219.220.221.223.224.225.230.231.232.233.235.236.237.238.245.246.248.249.272.274.279.280.281.282.283.286.293.298.299.314.315.316.317.318.319.320.321.322.323.324.325.326.327.328.329.330.331.332.333.334.335.336.337.338.339.340.341.342.343.344.345.346.347.348.349.350.351.352.353.565* %s, i16* %source, i16* %arraydecay, i16* %arraydecay1, i16* %arraydecay2, i16* %arraydecay3, i16* undef, i16* %arraydecay5) nounwind
+ %0 = load i64* %LARc28.sub, align 16
+ %1 = trunc i64 %0 to i32
+ %conv1 = lshr i32 %1, 2
+ %and = and i32 %conv1, 15
+ %or = or i32 %and, 208
+ %conv6 = trunc i32 %or to i8
+ %incdec.ptr = getelementptr inbounds i8* %c, i64 1
+ store i8 %conv6, i8* %c, align 1
+ %conv84 = trunc i64 %0 to i8
+ %and9 = shl i8 %conv84, 6
+ %incdec.ptr15 = getelementptr inbounds i8* %c, i64 2
+ store i8 %and9, i8* %incdec.ptr, align 1
+ %2 = lshr i64 %0, 50
+ %shr226.tr = trunc i64 %2 to i8
+ %conv25 = and i8 %shr226.tr, 7
+ %incdec.ptr26 = getelementptr inbounds i8* %c, i64 3
+ store i8 %conv25, i8* %incdec.ptr15, align 1
+ %incdec.ptr42 = getelementptr inbounds i8* %c, i64 4
+ store i8 0, i8* %incdec.ptr26, align 1
+ %arrayidx52 = getelementptr inbounds [8 x i16]* %tmpcast, i64 0, i64 7
+ %3 = load i16* %arrayidx52, align 2
+ %conv537 = trunc i16 %3 to i8
+ %and54 = and i8 %conv537, 7
+ %incdec.ptr57 = getelementptr inbounds i8* %c, i64 5
+ store i8 %and54, i8* %incdec.ptr42, align 1
+ %incdec.ptr68 = getelementptr inbounds i8* %c, i64 6
+ store i8 0, i8* %incdec.ptr57, align 1
+ %4 = load i16* %arraydecay3, align 2
+ %conv748 = trunc i16 %4 to i8
+ %and75 = shl i8 %conv748, 5
+ %shl76 = and i8 %and75, 96
+ %incdec.ptr84 = getelementptr inbounds i8* %c, i64 7
+ store i8 %shl76, i8* %incdec.ptr68, align 1
+ %arrayidx94 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 1
+ %5 = load i16* %arrayidx94, align 2
+ %conv959 = trunc i16 %5 to i8
+ %and96 = shl i8 %conv959, 1
+ %shl97 = and i8 %and96, 14
+ %or103 = or i8 %shl97, 1
+ %incdec.ptr105 = getelementptr inbounds i8* %c, i64 8
+ store i8 %or103, i8* %incdec.ptr84, align 1
+ %arrayidx115 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 4
+ %6 = bitcast i16* %arrayidx115 to i32*
+ %7 = load i32* %6, align 8
+ %conv11610 = trunc i32 %7 to i8
+ %and117 = and i8 %conv11610, 7
+ %incdec.ptr120 = getelementptr inbounds i8* %c, i64 9
+ store i8 %and117, i8* %incdec.ptr105, align 1
+ %8 = lshr i32 %7, 16
+ %and12330 = shl nuw nsw i32 %8, 5
+ %and123 = trunc i32 %and12330 to i8
+ %incdec.ptr136 = getelementptr inbounds i8* %c, i64 10
+ store i8 %and123, i8* %incdec.ptr120, align 1
+ %incdec.ptr157 = getelementptr inbounds i8* %c, i64 11
+ store i8 0, i8* %incdec.ptr136, align 1
+ %incdec.ptr172 = getelementptr inbounds i8* %c, i64 12
+ store i8 0, i8* %incdec.ptr157, align 1
+ %arrayidx173 = getelementptr inbounds [4 x i16]* %Nc, i64 0, i64 1
+ %9 = load i16* %arrayidx173, align 2
+ %conv17412 = zext i16 %9 to i32
+ %and175 = shl nuw nsw i32 %conv17412, 1
+ %arrayidx177 = getelementptr inbounds [4 x i16]* %bc, i64 0, i64 1
+ %10 = load i16* %arrayidx177, align 2
+ %conv17826 = zext i16 %10 to i32
+ %shr17913 = lshr i32 %conv17826, 1
+ %and180 = and i32 %shr17913, 1
+ %or181 = or i32 %and175, %and180
+ %conv182 = trunc i32 %or181 to i8
+ %incdec.ptr183 = getelementptr inbounds i8* %c, i64 13
+ store i8 %conv182, i8* %incdec.ptr172, align 1
+ %arrayidx188 = getelementptr inbounds [4 x i16]* %Mc, i64 0, i64 1
+ %11 = load i16* %arrayidx188, align 2
+ %conv18914 = trunc i16 %11 to i8
+ %and190 = shl i8 %conv18914, 5
+ %shl191 = and i8 %and190, 96
+ %incdec.ptr199 = getelementptr inbounds i8* %c, i64 14
+ store i8 %shl191, i8* %incdec.ptr183, align 1
+ %arrayidx209 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 14
+ %12 = load i16* %arrayidx209, align 4
+ %conv21015 = trunc i16 %12 to i8
+ %and211 = shl i8 %conv21015, 1
+ %shl212 = and i8 %and211, 14
+ %or218 = or i8 %shl212, 1
+ %incdec.ptr220 = getelementptr inbounds i8* %c, i64 15
+ store i8 %or218, i8* %incdec.ptr199, align 1
+ %arrayidx225 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 16
+ %13 = bitcast i16* %arrayidx225 to i64*
+ %14 = load i64* %13, align 16
+ %conv22616 = trunc i64 %14 to i8
+ %and227 = shl i8 %conv22616, 3
+ %shl228 = and i8 %and227, 56
+ %incdec.ptr235 = getelementptr inbounds i8* %c, i64 16
+ store i8 %shl228, i8* %incdec.ptr220, align 1
+ %15 = lshr i64 %14, 32
+ %and23832 = shl nuw nsw i64 %15, 5
+ %and238 = trunc i64 %and23832 to i8
+ %incdec.ptr251 = getelementptr inbounds i8* %c, i64 17
+ store i8 %and238, i8* %incdec.ptr235, align 1
+ %arrayidx266 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 23
+ %incdec.ptr272 = getelementptr inbounds i8* %c, i64 18
+ store i8 0, i8* %incdec.ptr251, align 1
+ %16 = load i16* %arrayidx266, align 2
+ %conv27418 = trunc i16 %16 to i8
+ %and275 = shl i8 %conv27418, 6
+ %incdec.ptr287 = getelementptr inbounds i8* %c, i64 19
+ store i8 %and275, i8* %incdec.ptr272, align 1
+ %arrayidx288 = getelementptr inbounds [4 x i16]* %Nc, i64 0, i64 2
+ %17 = load i16* %arrayidx288, align 2
+ %conv28919 = zext i16 %17 to i32
+ %and290 = shl nuw nsw i32 %conv28919, 1
+ %arrayidx292 = getelementptr inbounds [4 x i16]* %bc, i64 0, i64 2
+ %18 = load i16* %arrayidx292, align 2
+ %conv29327 = zext i16 %18 to i32
+ %shr29420 = lshr i32 %conv29327, 1
+ %and295 = and i32 %shr29420, 1
+ %or296 = or i32 %and290, %and295
+ %conv297 = trunc i32 %or296 to i8
+ %incdec.ptr298 = getelementptr inbounds i8* %c, i64 20
+ store i8 %conv297, i8* %incdec.ptr287, align 1
+ %conv30021 = trunc i16 %18 to i8
+ %and301 = shl i8 %conv30021, 7
+ %incdec.ptr314 = getelementptr inbounds i8* %c, i64 21
+ store i8 %and301, i8* %incdec.ptr298, align 1
+ %incdec.ptr335 = getelementptr inbounds i8* %c, i64 22
+ store i8 0, i8* %incdec.ptr314, align 1
+ %arrayidx340 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 29
+ %19 = load i16* %arrayidx340, align 2
+ %conv34122 = trunc i16 %19 to i8
+ %and342 = shl i8 %conv34122, 3
+ %shl343 = and i8 %and342, 56
+ %incdec.ptr350 = getelementptr inbounds i8* %c, i64 23
+ store i8 %shl343, i8* %incdec.ptr335, align 1
+ %arrayidx355 = getelementptr inbounds [52 x i16]* %xmc, i64 0, i64 32
+ %20 = bitcast i16* %arrayidx355 to i32*
+ %21 = load i32* %20, align 16
+ %conv35623 = shl i32 %21, 2
+ %shl358 = and i32 %conv35623, 28
+ %22 = lshr i32 %21, 17
+ %and363 = and i32 %22, 3
+ %or364 = or i32 %shl358, %and363
+ %conv365 = trunc i32 %or364 to i8
+ store i8 %conv365, i8* %incdec.ptr350, align 1
+ unreachable
+; CHECK: @gsm_encode
+}
+
+declare void @Gsm_Coder(%struct.gsm_state.2.8.39.44.45.55.56.57.58.59.62.63.64.65.74.75.76.77.80.87.92.93.94.95.96.97.110.111.112.113.114.128.130.135.136.137.138.139.140.141.142.143.144.145.148.149.150.151.152.169.170.177.178.179.184.185.186.187.188.201.208.209.219.220.221.223.224.225.230.231.232.233.235.236.237.238.245.246.248.249.272.274.279.280.281.282.283.286.293.298.299.314.315.316.317.318.319.320.321.322.323.324.325.326.327.328.329.330.331.332.333.334.335.336.337.338.339.340.341.342.343.344.345.346.347.348.349.350.351.352.353.565*, i16*, i16*, i16*, i16*, i16*, i16*, i16*)
+
+declare void @llvm.trap() noreturn nounwind
diff --git a/test/Transforms/BBVectorize/X86/sh-types.ll b/test/Transforms/BBVectorize/X86/sh-types.ll
new file mode 100644
index 0000000000..0bcb714d5e
--- /dev/null
+++ b/test/Transforms/BBVectorize/X86/sh-types.ll
@@ -0,0 +1,25 @@
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -bb-vectorize -S | FileCheck %s
+
+define <4 x float> @test7(<4 x float> %A1, <4 x float> %B1, double %C1, double %C2, double %D1, double %D2) {
+ %A2 = shufflevector <4 x float> %A1, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+ %B2 = shufflevector <4 x float> %B1, <4 x float> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 3>
+ %X1 = shufflevector <4 x float> %A2, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+ %X2 = shufflevector <4 x float> %B2, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+ %Y1 = shufflevector <2 x float> %X1, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ %Y2 = shufflevector <2 x float> %X2, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+
+ %M1 = fsub double %C1, %D1
+ %M2 = fsub double %C2, %D2
+ %N1 = fmul double %M1, %C1
+ %N2 = fmul double %M2, %C2
+ %Z1 = fadd double %N1, %D1
+ %Z2 = fadd double %N2, %D2
+
+ %R = fmul <4 x float> %Y1, %Y2
+ ret <4 x float> %R
+; CHECK: @test7
+; CHECK-NOT: <8 x float>
+; CHECK: ret <4 x float>
+}
+