summaryrefslogtreecommitdiff
path: root/test/CodeGen
diff options
context:
space:
mode:
Diffstat (limited to 'test/CodeGen')
-rw-r--r--test/CodeGen/X86/2007-04-25-MMX-PADDQ.ll41
-rw-r--r--test/CodeGen/X86/2007-05-15-maskmovq.ll8
-rw-r--r--test/CodeGen/X86/2007-06-15-IntToMMX.ll13
-rw-r--r--test/CodeGen/X86/2007-07-03-GR64ToVR64.ll14
-rw-r--r--test/CodeGen/X86/2008-04-08-CoalescerCrash.ll8
-rw-r--r--test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll6
-rw-r--r--test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll22
-rw-r--r--test/CodeGen/X86/2009-08-02-mmx-scalar-to-vector.ll8
-rw-r--r--test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll64
-rw-r--r--test/CodeGen/X86/fast-isel-bc.ll14
-rw-r--r--test/CodeGen/X86/mmx-arg-passing.ll19
-rw-r--r--test/CodeGen/X86/mmx-arg-passing2.ll14
-rw-r--r--test/CodeGen/X86/mmx-arith.ll380
-rw-r--r--test/CodeGen/X86/mmx-bitcast-to-i64.ll37
-rw-r--r--test/CodeGen/X86/mmx-insert-element.ll10
-rw-r--r--test/CodeGen/X86/mmx-punpckhdq.ll14
-rw-r--r--test/CodeGen/X86/mmx-shift.ll24
-rw-r--r--test/CodeGen/X86/mmx-shuffle.ll6
-rw-r--r--test/CodeGen/X86/mmx-vzmovl-2.ll20
-rw-r--r--test/CodeGen/X86/mmx-vzmovl.ll4
-rw-r--r--test/CodeGen/X86/vec_insert-5.ll9
-rw-r--r--test/CodeGen/X86/vec_insert-7.ll12
-rw-r--r--test/CodeGen/X86/vec_zero_cse.ll5
23 files changed, 539 insertions, 213 deletions
diff --git a/test/CodeGen/X86/2007-04-25-MMX-PADDQ.ll b/test/CodeGen/X86/2007-04-25-MMX-PADDQ.ll
index c39b82a1fe..256cbbb5d9 100644
--- a/test/CodeGen/X86/2007-04-25-MMX-PADDQ.ll
+++ b/test/CodeGen/X86/2007-04-25-MMX-PADDQ.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -o - -march=x86 -mattr=+mmx | FileCheck %s
+; There are no MMX instructions here. We use add+adcl for the adds.
define <1 x i64> @unsigned_add3(<1 x i64>* %a, <1 x i64>* %b, i32 %count) nounwind {
entry:
@@ -7,9 +8,8 @@ entry:
bb26: ; preds = %bb26, %entry
-; CHECK: movq ({{.*}},8), %mm
-; CHECK: paddq ({{.*}},8), %mm
-; CHECK: paddq %mm{{[0-7]}}, %mm
+; CHECK: addl %eax, %ebx
+; CHECK: adcl %edx, %ebp
%i.037.0 = phi i32 [ 0, %entry ], [ %tmp25, %bb26 ] ; <i32> [#uses=3]
%sum.035.0 = phi <1 x i64> [ zeroinitializer, %entry ], [ %tmp22, %bb26 ] ; <<1 x i64>> [#uses=1]
@@ -27,3 +27,38 @@ bb31: ; preds = %bb26, %entry
%sum.035.1 = phi <1 x i64> [ zeroinitializer, %entry ], [ %tmp22, %bb26 ] ; <<1 x i64>> [#uses=1]
ret <1 x i64> %sum.035.1
}
+
+
+; This is the original test converted to use MMX intrinsics.
+
+define <1 x i64> @unsigned_add3a(x86_mmx* %a, x86_mmx* %b, i32 %count) nounwind {
+entry:
+ %tmp2943 = bitcast <1 x i64><i64 0> to x86_mmx
+ %tmp2942 = icmp eq i32 %count, 0 ; <i1> [#uses=1]
+ br i1 %tmp2942, label %bb31, label %bb26
+
+bb26: ; preds = %bb26, %entry
+
+; CHECK: movq ({{.*}},8), %mm
+; CHECK: paddq ({{.*}},8), %mm
+; CHECK: paddq %mm{{[0-7]}}, %mm
+
+ %i.037.0 = phi i32 [ 0, %entry ], [ %tmp25, %bb26 ] ; <i32> [#uses=3]
+ %sum.035.0 = phi x86_mmx [ %tmp2943, %entry ], [ %tmp22, %bb26 ] ; <x86_mmx> [#uses=1]
+ %tmp13 = getelementptr x86_mmx* %b, i32 %i.037.0 ; <x86_mmx*> [#uses=1]
+ %tmp14 = load x86_mmx* %tmp13 ; <x86_mmx> [#uses=1]
+ %tmp18 = getelementptr x86_mmx* %a, i32 %i.037.0 ; <x86_mmx*> [#uses=1]
+ %tmp19 = load x86_mmx* %tmp18 ; <x86_mmx> [#uses=1]
+ %tmp21 = call x86_mmx @llvm.x86.mmx.padd.q (x86_mmx %tmp19, x86_mmx %tmp14) ; <x86_mmx> [#uses=1]
+ %tmp22 = call x86_mmx @llvm.x86.mmx.padd.q (x86_mmx %tmp21, x86_mmx %sum.035.0) ; <x86_mmx> [#uses=2]
+ %tmp25 = add i32 %i.037.0, 1 ; <i32> [#uses=2]
+ %tmp29 = icmp ult i32 %tmp25, %count ; <i1> [#uses=1]
+ br i1 %tmp29, label %bb26, label %bb31
+
+bb31: ; preds = %bb26, %entry
+ %sum.035.1 = phi x86_mmx [ %tmp2943, %entry ], [ %tmp22, %bb26 ] ; <x86_mmx> [#uses=1]
+ %t = bitcast x86_mmx %sum.035.1 to <1 x i64>
+ ret <1 x i64> %t
+}
+
+declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx)
diff --git a/test/CodeGen/X86/2007-05-15-maskmovq.ll b/test/CodeGen/X86/2007-05-15-maskmovq.ll
index 2093b8f687..006cf2e43a 100644
--- a/test/CodeGen/X86/2007-05-15-maskmovq.ll
+++ b/test/CodeGen/X86/2007-05-15-maskmovq.ll
@@ -5,10 +5,10 @@ target triple = "i686-apple-darwin8"
define void @test(<1 x i64> %c64, <1 x i64> %mask1, i8* %P) {
entry:
- %tmp4 = bitcast <1 x i64> %mask1 to <8 x i8> ; <<8 x i8>> [#uses=1]
- %tmp6 = bitcast <1 x i64> %c64 to <8 x i8> ; <<8 x i8>> [#uses=1]
- tail call void @llvm.x86.mmx.maskmovq( <8 x i8> %tmp6, <8 x i8> %tmp4, i8* %P )
+ %tmp4 = bitcast <1 x i64> %mask1 to x86_mmx ; <x86_mmx> [#uses=1]
+ %tmp6 = bitcast <1 x i64> %c64 to x86_mmx ; <x86_mmx> [#uses=1]
+ tail call void @llvm.x86.mmx.maskmovq( x86_mmx %tmp6, x86_mmx %tmp4, i8* %P )
ret void
}
-declare void @llvm.x86.mmx.maskmovq(<8 x i8>, <8 x i8>, i8*)
+declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, i8*)
diff --git a/test/CodeGen/X86/2007-06-15-IntToMMX.ll b/test/CodeGen/X86/2007-06-15-IntToMMX.ll
index 6128d8b92d..660d4fe7b1 100644
--- a/test/CodeGen/X86/2007-06-15-IntToMMX.ll
+++ b/test/CodeGen/X86/2007-06-15-IntToMMX.ll
@@ -1,17 +1,16 @@
; RUN: llc < %s -march=x86-64 -mattr=+mmx | grep paddusw
-@R = external global <1 x i64> ; <<1 x i64>*> [#uses=1]
+@R = external global x86_mmx ; <x86_mmx*> [#uses=1]
define void @foo(<1 x i64> %A, <1 x i64> %B) {
entry:
- %tmp4 = bitcast <1 x i64> %B to <4 x i16> ; <<4 x i16>> [#uses=1]
- %tmp6 = bitcast <1 x i64> %A to <4 x i16> ; <<4 x i16>> [#uses=1]
- %tmp7 = tail call <4 x i16> @llvm.x86.mmx.paddus.w( <4 x i16> %tmp6, <4 x i16> %tmp4 ) ; <<4 x i16>> [#uses=1]
- %tmp8 = bitcast <4 x i16> %tmp7 to <1 x i64> ; <<1 x i64>> [#uses=1]
- store <1 x i64> %tmp8, <1 x i64>* @R
+ %tmp2 = bitcast <1 x i64> %A to x86_mmx
+ %tmp3 = bitcast <1 x i64> %B to x86_mmx
+ %tmp7 = tail call x86_mmx @llvm.x86.mmx.paddus.w( x86_mmx %tmp2, x86_mmx %tmp3 ) ; <x86_mmx> [#uses=1]
+ store x86_mmx %tmp7, x86_mmx* @R
tail call void @llvm.x86.mmx.emms( )
ret void
}
-declare <4 x i16> @llvm.x86.mmx.paddus.w(<4 x i16>, <4 x i16>)
+declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
declare void @llvm.x86.mmx.emms()
diff --git a/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll b/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll
index 2c513f1781..1c5e6766fd 100644
--- a/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll
+++ b/test/CodeGen/X86/2007-07-03-GR64ToVR64.ll
@@ -2,19 +2,17 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx | grep {movd %rdi, %mm1}
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx | grep {paddusw %mm0, %mm1}
-@R = external global <1 x i64> ; <<1 x i64>*> [#uses=1]
+@R = external global x86_mmx ; <x86_mmx*> [#uses=1]
define void @foo(<1 x i64> %A, <1 x i64> %B) nounwind {
entry:
- %tmp4 = bitcast <1 x i64> %B to <4 x i16> ; <<4 x i16>> [#uses=1]
- %tmp6 = bitcast <1 x i64> %A to <4 x i16> ; <<4 x i16>> [#uses=1]
- %tmp7 = tail call <4 x i16> @llvm.x86.mmx.paddus.w( <4 x i16> %tmp6, <4 x i16> %tmp4 ) ; <<4 x i16>> [#uses=1]
- %tmp8 = bitcast <4 x i16> %tmp7 to <1 x i64> ; <<1 x i64>> [#uses=1]
- store <1 x i64> %tmp8, <1 x i64>* @R
+ %tmp4 = bitcast <1 x i64> %B to x86_mmx ; <x86_mmx> [#uses=1]
+ %tmp6 = bitcast <1 x i64> %A to x86_mmx ; <x86_mmx> [#uses=1]
+ %tmp7 = tail call x86_mmx @llvm.x86.mmx.paddus.w( x86_mmx %tmp6, x86_mmx %tmp4 ) ; <x86_mmx> [#uses=1]
+ store x86_mmx %tmp7, x86_mmx* @R
tail call void @llvm.x86.mmx.emms( )
ret void
}
-declare <4 x i16> @llvm.x86.mmx.paddus.w(<4 x i16>, <4 x i16>)
-
+declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
declare void @llvm.x86.mmx.emms()
diff --git a/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll b/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll
index dc8c097efc..5089e8c5b6 100644
--- a/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll
+++ b/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll
@@ -5,15 +5,15 @@ entry:
tail call void asm sideeffect "# top of block", "~{dirflag},~{fpsr},~{flags},~{di},~{si},~{dx},~{cx},~{ax}"( ) nounwind
tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
tail call void asm sideeffect ".line 8", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
- %tmp1 = tail call <2 x i32> asm sideeffect "movd $1, $0", "=={mm4},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( i32 undef ) nounwind ; <<2 x i32>> [#uses=1]
+ %tmp1 = tail call x86_mmx asm sideeffect "movd $1, $0", "=={mm4},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( i32 undef ) nounwind ; <x86_mmx> [#uses=1]
tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
tail call void asm sideeffect ".line 9", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
- %tmp3 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm3},~{dirflag},~{fpsr},~{flags},~{memory}"( <2 x i32> undef ) nounwind ; <i32> [#uses=1]
+ %tmp3 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm3},~{dirflag},~{fpsr},~{flags},~{memory}"( x86_mmx undef ) nounwind ; <i32> [#uses=1]
tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
tail call void asm sideeffect ".line 10", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
- tail call void asm sideeffect "movntq $0, 0($1,$2)", "{mm0},{di},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( <2 x i32> undef, i32 undef, i32 %tmp3 ) nounwind
+ tail call void asm sideeffect "movntq $0, 0($1,$2)", "{mm0},{di},{bp},~{dirflag},~{fpsr},~{flags},~{memory}"( x86_mmx undef, i32 undef, i32 %tmp3 ) nounwind
tail call void asm sideeffect ".file \224443946.c\22", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
tail call void asm sideeffect ".line 11", "~{dirflag},~{fpsr},~{flags}"( ) nounwind
- %tmp8 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm4},~{dirflag},~{fpsr},~{flags},~{memory}"( <2 x i32> %tmp1 ) nounwind ; <i32> [#uses=0]
+ %tmp8 = tail call i32 asm sideeffect "movd $1, $0", "=={bp},{mm4},~{dirflag},~{fpsr},~{flags},~{memory}"( x86_mmx %tmp1 ) nounwind ; <i32> [#uses=0]
ret i32 undef
}
diff --git a/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll b/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll
index c76dd7de12..53402c0451 100644
--- a/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll
+++ b/test/CodeGen/X86/2008-08-23-64Bit-maskmovq.ll
@@ -17,11 +17,13 @@ entry:
br i1 false, label %bb.nph144.split, label %bb133
bb.nph144.split: ; preds = %entry
- tail call void @llvm.x86.mmx.maskmovq( <8 x i8> zeroinitializer, <8 x i8> zeroinitializer, i8* null ) nounwind
+ %tmp = bitcast <8 x i8> zeroinitializer to x86_mmx
+ %tmp2 = bitcast <8 x i8> zeroinitializer to x86_mmx
+ tail call void @llvm.x86.mmx.maskmovq( x86_mmx %tmp, x86_mmx %tmp2, i8* null ) nounwind
unreachable
bb133: ; preds = %entry
ret void
}
-declare void @llvm.x86.mmx.maskmovq(<8 x i8>, <8 x i8>, i8*) nounwind
+declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, i8*) nounwind
diff --git a/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll b/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll
index 60be0d51e7..2dc1deaf17 100644
--- a/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll
+++ b/test/CodeGen/X86/2008-09-05-sinttofp-2xi32.ll
@@ -1,6 +1,9 @@
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mattr=+mmx | grep unpcklpd
+; RUN: llc < %s -march=x86 -mattr=+sse2 -mattr=+mmx | grep unpckhpd
; RUN: llc < %s -march=x86 -mattr=+sse2 | grep cvttpd2pi | count 1
; RUN: llc < %s -march=x86 -mattr=+sse2 | grep cvtpi2pd | count 1
-; PR2687
+; originally from PR2687, but things don't work that way any more.
+; there are no MMX instructions here; we use XMM.
define <2 x double> @a(<2 x i32> %x) nounwind {
entry:
@@ -13,3 +16,20 @@ entry:
%y = fptosi <2 x double> %x to <2 x i32>
ret <2 x i32> %y
}
+
+; This is how to get MMX instructions.
+
+define <2 x double> @a2(x86_mmx %x) nounwind {
+entry:
+ %y = tail call <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx %x)
+ ret <2 x double> %y
+}
+
+define x86_mmx @b2(<2 x double> %x) nounwind {
+entry:
+ %y = tail call x86_mmx @llvm.x86.sse.cvttpd2pi (<2 x double> %x)
+ ret x86_mmx %y
+}
+
+declare <2 x double> @llvm.x86.sse.cvtpi2pd(x86_mmx)
+declare x86_mmx @llvm.x86.sse.cvttpd2pi(<2 x double>)
diff --git a/test/CodeGen/X86/2009-08-02-mmx-scalar-to-vector.ll b/test/CodeGen/X86/2009-08-02-mmx-scalar-to-vector.ll
index b9b09a3f00..288eef4f69 100644
--- a/test/CodeGen/X86/2009-08-02-mmx-scalar-to-vector.ll
+++ b/test/CodeGen/X86/2009-08-02-mmx-scalar-to-vector.ll
@@ -1,10 +1,12 @@
; RUN: llc < %s -march=x86-64
; PR4669
-declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32)
+declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32)
define <1 x i64> @test(i64 %t) {
entry:
%t1 = insertelement <1 x i64> undef, i64 %t, i32 0
- %t2 = tail call <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64> %t1, i32 48)
- ret <1 x i64> %t2
+ %t0 = bitcast <1 x i64> %t1 to x86_mmx
+ %t2 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %t0, i32 48)
+ %t3 = bitcast x86_mmx %t2 to <1 x i64>
+ ret <1 x i64> %t3
}
diff --git a/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll b/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll
index 4cd3be35e8..fa3d5fbcdc 100644
--- a/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll
+++ b/test/CodeGen/X86/2010-04-23-mmx-movdq2q.ll
@@ -1,12 +1,12 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx,+sse2 | FileCheck %s
+; There are no MMX operations here, so we use XMM or i64.
define void @ti8(double %a, double %b) nounwind {
entry:
%tmp1 = bitcast double %a to <8 x i8>
-; CHECK: movdq2q
%tmp2 = bitcast double %b to <8 x i8>
-; CHECK: movdq2q
%tmp3 = add <8 x i8> %tmp1, %tmp2
+; CHECK: paddb %xmm1, %xmm0
store <8 x i8> %tmp3, <8 x i8>* null
ret void
}
@@ -14,10 +14,9 @@ entry:
define void @ti16(double %a, double %b) nounwind {
entry:
%tmp1 = bitcast double %a to <4 x i16>
-; CHECK: movdq2q
%tmp2 = bitcast double %b to <4 x i16>
-; CHECK: movdq2q
%tmp3 = add <4 x i16> %tmp1, %tmp2
+; CHECK: paddw %xmm1, %xmm0
store <4 x i16> %tmp3, <4 x i16>* null
ret void
}
@@ -25,10 +24,9 @@ entry:
define void @ti32(double %a, double %b) nounwind {
entry:
%tmp1 = bitcast double %a to <2 x i32>
-; CHECK: movdq2q
%tmp2 = bitcast double %b to <2 x i32>
-; CHECK: movdq2q
%tmp3 = add <2 x i32> %tmp1, %tmp2
+; CHECK: paddd %xmm1, %xmm0
store <2 x i32> %tmp3, <2 x i32>* null
ret void
}
@@ -36,10 +34,60 @@ entry:
define void @ti64(double %a, double %b) nounwind {
entry:
%tmp1 = bitcast double %a to <1 x i64>
-; CHECK: movdq2q
%tmp2 = bitcast double %b to <1 x i64>
-; CHECK: movdq2q
%tmp3 = add <1 x i64> %tmp1, %tmp2
+; CHECK: addq %rax, %rcx
store <1 x i64> %tmp3, <1 x i64>* null
ret void
}
+
+; MMX intrinsics calls get us MMX instructions.
+
+define void @ti8a(double %a, double %b) nounwind {
+entry:
+ %tmp1 = bitcast double %a to x86_mmx
+; CHECK: movdq2q
+ %tmp2 = bitcast double %b to x86_mmx
+; CHECK: movdq2q
+ %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %tmp1, x86_mmx %tmp2)
+ store x86_mmx %tmp3, x86_mmx* null
+ ret void
+}
+
+define void @ti16a(double %a, double %b) nounwind {
+entry:
+ %tmp1 = bitcast double %a to x86_mmx
+; CHECK: movdq2q
+ %tmp2 = bitcast double %b to x86_mmx
+; CHECK: movdq2q
+ %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %tmp1, x86_mmx %tmp2)
+ store x86_mmx %tmp3, x86_mmx* null
+ ret void
+}
+
+define void @ti32a(double %a, double %b) nounwind {
+entry:
+ %tmp1 = bitcast double %a to x86_mmx
+; CHECK: movdq2q
+ %tmp2 = bitcast double %b to x86_mmx
+; CHECK: movdq2q
+ %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %tmp1, x86_mmx %tmp2)
+ store x86_mmx %tmp3, x86_mmx* null
+ ret void
+}
+
+define void @ti64a(double %a, double %b) nounwind {
+entry:
+ %tmp1 = bitcast double %a to x86_mmx
+; CHECK: movdq2q
+ %tmp2 = bitcast double %b to x86_mmx
+; CHECK: movdq2q
+ %tmp3 = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %tmp1, x86_mmx %tmp2)
+ store x86_mmx %tmp3, x86_mmx* null
+ ret void
+}
+
+declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx)
diff --git a/test/CodeGen/X86/fast-isel-bc.ll b/test/CodeGen/X86/fast-isel-bc.ll
index 8d7dc8f9a7..db846889d8 100644
--- a/test/CodeGen/X86/fast-isel-bc.ll
+++ b/test/CodeGen/X86/fast-isel-bc.ll
@@ -5,15 +5,19 @@ target datalayout =
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-apple-darwin9.8"
-declare void @func2(<1 x i64>)
+declare void @func2(x86_mmx)
define void @func1() nounwind {
; This isn't spectacular, but it's MMX code at -O0...
-; CHECK: movl $2, %eax
-; CHECK: movd %rax, %mm0
-; CHECK: movd %mm0, %rdi
+; CHECK: movq2dq %mm0, %xmm0
+; For now, handling of x86_mmx parameters in fast Isel is unimplemented,
+; so we get pretty poor code. The below is preferable.
+; CHEK: movl $2, %eax
+; CHEK: movd %rax, %mm0
+; CHEK: movd %mm0, %rdi
- call void @func2(<1 x i64> <i64 2>)
+ %tmp0 = bitcast <2 x i32><i32 0, i32 2> to x86_mmx
+ call void @func2(x86_mmx %tmp0)
ret void
}
diff --git a/test/CodeGen/X86/mmx-arg-passing.ll b/test/CodeGen/X86/mmx-arg-passing.ll
index 426e98e019..b348512b57 100644
--- a/test/CodeGen/X86/mmx-arg-passing.ll
+++ b/test/CodeGen/X86/mmx-arg-passing.ll
@@ -1,24 +1,27 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+mmx | grep mm0 | count 3
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+mmx | grep esp | count 1
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+mmx | grep mm0 | count 1
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+mmx | grep esp | count 2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx,+sse2 | grep xmm0
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx,+sse2 | grep rdi
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx,+sse2 | not grep movups
;
; On Darwin x86-32, v8i8, v4i16, v2i32 values are passed in MM[0-2].
-; On Darwin x86-32, v1i64 values are passed in memory.
+; On Darwin x86-32, v1i64 values are passed in memory. In this example, they
+; are never moved into an MM register at all.
; On Darwin x86-64, v8i8, v4i16, v2i32 values are passed in XMM[0-7].
; On Darwin x86-64, v1i64 values are passed in 64-bit GPRs.
-@u1 = external global <8 x i8>
+@u1 = external global x86_mmx
-define void @t1(<8 x i8> %v1) nounwind {
- store <8 x i8> %v1, <8 x i8>* @u1, align 8
+define void @t1(x86_mmx %v1) nounwind {
+ store x86_mmx %v1, x86_mmx* @u1, align 8
ret void
}
-@u2 = external global <1 x i64>
+@u2 = external global x86_mmx
define void @t2(<1 x i64> %v1) nounwind {
- store <1 x i64> %v1, <1 x i64>* @u2, align 8
+ %tmp = bitcast <1 x i64> %v1 to x86_mmx
+ store x86_mmx %tmp, x86_mmx* @u2, align 8
ret void
}
+
diff --git a/test/CodeGen/X86/mmx-arg-passing2.ll b/test/CodeGen/X86/mmx-arg-passing2.ll
index c42af08236..c132d311b9 100644
--- a/test/CodeGen/X86/mmx-arg-passing2.ll
+++ b/test/CodeGen/X86/mmx-arg-passing2.ll
@@ -1,17 +1,21 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx,+sse2 | grep movq2dq | count 1
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+mmx,+sse2 | grep movdq2q | count 2
+; Since the add is not an MMX add, we don't have a movq2dq any more.
@g_v8qi = external global <8 x i8>
define void @t1() nounwind {
%tmp3 = load <8 x i8>* @g_v8qi, align 8
- %tmp4 = tail call i32 (...)* @pass_v8qi( <8 x i8> %tmp3 ) nounwind
+ %tmp3a = bitcast <8 x i8> %tmp3 to x86_mmx
+ %tmp4 = tail call i32 (...)* @pass_v8qi( x86_mmx %tmp3a ) nounwind
ret void
}
-define void @t2(<8 x i8> %v1, <8 x i8> %v2) nounwind {
- %tmp3 = add <8 x i8> %v1, %v2
- %tmp4 = tail call i32 (...)* @pass_v8qi( <8 x i8> %tmp3 ) nounwind
+define void @t2(x86_mmx %v1, x86_mmx %v2) nounwind {
+ %v1a = bitcast x86_mmx %v1 to <8 x i8>
+ %v2b = bitcast x86_mmx %v2 to <8 x i8>
+ %tmp3 = add <8 x i8> %v1a, %v2b
+ %tmp3a = bitcast <8 x i8> %tmp3 to x86_mmx
+ %tmp4 = tail call i32 (...)* @pass_v8qi( x86_mmx %tmp3a ) nounwind
ret void
}
diff --git a/test/CodeGen/X86/mmx-arith.ll b/test/CodeGen/X86/mmx-arith.ll
index e4dfdbfe1b..6817487324 100644
--- a/test/CodeGen/X86/mmx-arith.ll
+++ b/test/CodeGen/X86/mmx-arith.ll
@@ -1,131 +1,309 @@
; RUN: llc < %s -march=x86 -mattr=+mmx
;; A basic sanity check to make sure that MMX arithmetic actually compiles.
+;; First is a straight translation of the original with bitcasts as needed.
-define void @foo(<8 x i8>* %A, <8 x i8>* %B) {
+define void @foo(x86_mmx* %A, x86_mmx* %B) {
entry:
- %tmp1 = load <8 x i8>* %A ; <<8 x i8>> [#uses=1]
- %tmp3 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1]
- %tmp4 = add <8 x i8> %tmp1, %tmp3 ; <<8 x i8>> [#uses=2]
- store <8 x i8> %tmp4, <8 x i8>* %A
- %tmp7 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1]
- %tmp12 = tail call <8 x i8> @llvm.x86.mmx.padds.b( <8 x i8> %tmp4, <8 x i8> %tmp7 ) ; <<8 x i8>> [#uses=2]
- store <8 x i8> %tmp12, <8 x i8>* %A
- %tmp16 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1]
- %tmp21 = tail call <8 x i8> @llvm.x86.mmx.paddus.b( <8 x i8> %tmp12, <8 x i8> %tmp16 ) ; <<8 x i8>> [#uses=2]
- store <8 x i8> %tmp21, <8 x i8>* %A
- %tmp27 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1]
- %tmp28 = sub <8 x i8> %tmp21, %tmp27 ; <<8 x i8>> [#uses=2]
- store <8 x i8> %tmp28, <8 x i8>* %A
- %tmp31 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1]
- %tmp36 = tail call <8 x i8> @llvm.x86.mmx.psubs.b( <8 x i8> %tmp28, <8 x i8> %tmp31 ) ; <<8 x i8>> [#uses=2]
- store <8 x i8> %tmp36, <8 x i8>* %A
- %tmp40 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1]
- %tmp45 = tail call <8 x i8> @llvm.x86.mmx.psubus.b( <8 x i8> %tmp36, <8 x i8> %tmp40 ) ; <<8 x i8>> [#uses=2]
- store <8 x i8> %tmp45, <8 x i8>* %A
- %tmp51 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1]
- %tmp52 = mul <8 x i8> %tmp45, %tmp51 ; <<8 x i8>> [#uses=2]
- store <8 x i8> %tmp52, <8 x i8>* %A
- %tmp57 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1]
- %tmp58 = and <8 x i8> %tmp52, %tmp57 ; <<8 x i8>> [#uses=2]
- store <8 x i8> %tmp58, <8 x i8>* %A
- %tmp63 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1]
- %tmp64 = or <8 x i8> %tmp58, %tmp63 ; <<8 x i8>> [#uses=2]
- store <8 x i8> %tmp64, <8 x i8>* %A
- %tmp69 = load <8 x i8>* %B ; <<8 x i8>> [#uses=1]
- %tmp70 = xor <8 x i8> %tmp64, %tmp69 ; <<8 x i8>> [#uses=1]
- store <8 x i8> %tmp70, <8 x i8>* %A
+ %tmp1 = load x86_mmx* %A ; <x86_mmx> [#uses=1]
+ %tmp3 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp1a = bitcast x86_mmx %tmp1 to <8 x i8>
+ %tmp3a = bitcast x86_mmx %tmp3 to <8 x i8>
+ %tmp4 = add <8 x i8> %tmp1a, %tmp3a ; <<8 x i8>> [#uses=2]
+ %tmp4a = bitcast <8 x i8> %tmp4 to x86_mmx
+ store x86_mmx %tmp4a, x86_mmx* %A
+ %tmp7 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.b( x86_mmx %tmp4a, x86_mmx %tmp7 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp12, x86_mmx* %A
+ %tmp16 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.b( x86_mmx %tmp12, x86_mmx %tmp16 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp21, x86_mmx* %A
+ %tmp27 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp21a = bitcast x86_mmx %tmp21 to <8 x i8>
+ %tmp27a = bitcast x86_mmx %tmp27 to <8 x i8>
+ %tmp28 = sub <8 x i8> %tmp21a, %tmp27a ; <<8 x i8>> [#uses=2]
+ %tmp28a = bitcast <8 x i8> %tmp28 to x86_mmx
+ store x86_mmx %tmp28a, x86_mmx* %A
+ %tmp31 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.b( x86_mmx %tmp28a, x86_mmx %tmp31 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp36, x86_mmx* %A
+ %tmp40 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.b( x86_mmx %tmp36, x86_mmx %tmp40 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp45, x86_mmx* %A
+ %tmp51 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp45a = bitcast x86_mmx %tmp45 to <8 x i8>
+ %tmp51a = bitcast x86_mmx %tmp51 to <8 x i8>
+ %tmp52 = mul <8 x i8> %tmp45a, %tmp51a ; <<8 x i8>> [#uses=2]
+ %tmp52a = bitcast <8 x i8> %tmp52 to x86_mmx
+ store x86_mmx %tmp52a, x86_mmx* %A
+ %tmp57 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp57a = bitcast x86_mmx %tmp57 to <8 x i8>
+ %tmp58 = and <8 x i8> %tmp52, %tmp57a ; <<8 x i8>> [#uses=2]
+ %tmp58a = bitcast <8 x i8> %tmp58 to x86_mmx
+ store x86_mmx %tmp58a, x86_mmx* %A
+ %tmp63 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp63a = bitcast x86_mmx %tmp63 to <8 x i8>
+ %tmp64 = or <8 x i8> %tmp58, %tmp63a ; <<8 x i8>> [#uses=2]
+ %tmp64a = bitcast <8 x i8> %tmp64 to x86_mmx
+ store x86_mmx %tmp64a, x86_mmx* %A
+ %tmp69 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp69a = bitcast x86_mmx %tmp69 to <8 x i8>
+ %tmp64b = bitcast x86_mmx %tmp64a to <8 x i8>
+ %tmp70 = xor <8 x i8> %tmp64b, %tmp69a ; <<8 x i8>> [#uses=1]
+ %tmp70a = bitcast <8 x i8> %tmp70 to x86_mmx
+ store x86_mmx %tmp70a, x86_mmx* %A
tail call void @llvm.x86.mmx.emms( )
ret void
}
-define void @baz(<2 x i32>* %A, <2 x i32>* %B) {
+define void @baz(x86_mmx* %A, x86_mmx* %B) {
entry:
- %tmp1 = load <2 x i32>* %A ; <<2 x i32>> [#uses=1]
- %tmp3 = load <2 x i32>* %B ; <<2 x i32>> [#uses=1]
- %tmp4 = add <2 x i32> %tmp1, %tmp3 ; <<2 x i32>> [#uses=2]
- store <2 x i32> %tmp4, <2 x i32>* %A
- %tmp9 = load <2 x i32>* %B ; <<2 x i32>> [#uses=1]
- %tmp10 = sub <2 x i32> %tmp4, %tmp9 ; <<2 x i32>> [#uses=2]
- store <2 x i32> %tmp10, <2 x i32>* %A
- %tmp15 = load <2 x i32>* %B ; <<2 x i32>> [#uses=1]
- %tmp16 = mul <2 x i32> %tmp10, %tmp15 ; <<2 x i32>> [#uses=2]
- store <2 x i32> %tmp16, <2 x i32>* %A
- %tmp21 = load <2 x i32>* %B ; <<2 x i32>> [#uses=1]
- %tmp22 = and <2 x i32> %tmp16, %tmp21 ; <<2 x i32>> [#uses=2]
- store <2 x i32> %tmp22, <2 x i32>* %A
- %tmp27 = load <2 x i32>* %B ; <<2 x i32>> [#uses=1]
- %tmp28 = or <2 x i32> %tmp22, %tmp27 ; <<2 x i32>> [#uses=2]
- store <2 x i32> %tmp28, <2 x i32>* %A
- %tmp33 = load <2 x i32>* %B ; <<2 x i32>> [#uses=1]
- %tmp34 = xor <2 x i32> %tmp28, %tmp33 ; <<2 x i32>> [#uses=1]
- store <2 x i32> %tmp34, <2 x i32>* %A
+ %tmp1 = load x86_mmx* %A ; <x86_mmx> [#uses=1]
+ %tmp3 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp1a = bitcast x86_mmx %tmp1 to <2 x i32>
+ %tmp3a = bitcast x86_mmx %tmp3 to <2 x i32>
+ %tmp4 = add <2 x i32> %tmp1a, %tmp3a ; <<2 x i32>> [#uses=2]
+ %tmp4a = bitcast <2 x i32> %tmp4 to x86_mmx
+ store x86_mmx %tmp4a, x86_mmx* %A
+ %tmp9 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp9a = bitcast x86_mmx %tmp9 to <2 x i32>
+ %tmp10 = sub <2 x i32> %tmp4, %tmp9a ; <<2 x i32>> [#uses=2]
+ %tmp10a = bitcast <2 x i32> %tmp10 to x86_mmx
+ store x86_mmx %tmp10a, x86_mmx* %A
+ %tmp15 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp10b = bitcast x86_mmx %tmp10a to <2 x i32>
+ %tmp15a = bitcast x86_mmx %tmp15 to <2 x i32>
+ %tmp16 = mul <2 x i32> %tmp10b, %tmp15a ; <<2 x i32>> [#uses=2]
+ %tmp16a = bitcast <2 x i32> %tmp16 to x86_mmx
+ store x86_mmx %tmp16a, x86_mmx* %A
+ %tmp21 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp16b = bitcast x86_mmx %tmp16a to <2 x i32>
+ %tmp21a = bitcast x86_mmx %tmp21 to <2 x i32>
+ %tmp22 = and <2 x i32> %tmp16b, %tmp21a ; <<2 x i32>> [#uses=2]
+ %tmp22a = bitcast <2 x i32> %tmp22 to x86_mmx
+ store x86_mmx %tmp22a, x86_mmx* %A
+ %tmp27 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp22b = bitcast x86_mmx %tmp22a to <2 x i32>
+ %tmp27a = bitcast x86_mmx %tmp27 to <2 x i32>
+ %tmp28 = or <2 x i32> %tmp22b, %tmp27a ; <<2 x i32>> [#uses=2]
+ %tmp28a = bitcast <2 x i32> %tmp28 to x86_mmx
+ store x86_mmx %tmp28a, x86_mmx* %A
+ %tmp33 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp28b = bitcast x86_mmx %tmp28a to <2 x i32>
+ %tmp33a = bitcast x86_mmx %tmp33 to <2 x i32>
+ %tmp34 = xor <2 x i32> %tmp28b, %tmp33a ; <<2 x i32>> [#uses=1]
+ %tmp34a = bitcast <2 x i32> %tmp34 to x86_mmx
+ store x86_mmx %tmp34a, x86_mmx* %A
tail call void @llvm.x86.mmx.emms( )
ret void
}
-define void @bar(<4 x i16>* %A, <4 x i16>* %B) {
+define void @bar(x86_mmx* %A, x86_mmx* %B) {
entry:
- %tmp1 = load <4 x i16>* %A ; <<4 x i16>> [#uses=1]
- %tmp3 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1]
- %tmp4 = add <4 x i16> %tmp1, %tmp3 ; <<4 x i16>> [#uses=2]
- store <4 x i16> %tmp4, <4 x i16>* %A
- %tmp7 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1]
- %tmp12 = tail call <4 x i16> @llvm.x86.mmx.padds.w( <4 x i16> %tmp4, <4 x i16> %tmp7 ) ; <<4 x i16>> [#uses=2]
- store <4 x i16> %tmp12, <4 x i16>* %A
- %tmp16 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1]
- %tmp21 = tail call <4 x i16> @llvm.x86.mmx.paddus.w( <4 x i16> %tmp12, <4 x i16> %tmp16 ) ; <<4 x i16>> [#uses=2]
- store <4 x i16> %tmp21, <4 x i16>* %A
- %tmp27 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1]
- %tmp28 = sub <4 x i16> %tmp21, %tmp27 ; <<4 x i16>> [#uses=2]
- store <4 x i16> %tmp28, <4 x i16>* %A
- %tmp31 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1]
- %tmp36 = tail call <4 x i16> @llvm.x86.mmx.psubs.w( <4 x i16> %tmp28, <4 x i16> %tmp31 ) ; <<4 x i16>> [#uses=2]
- store <4 x i16> %tmp36, <4 x i16>* %A
- %tmp40 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1]
- %tmp45 = tail call <4 x i16> @llvm.x86.mmx.psubus.w( <4 x i16> %tmp36, <4 x i16> %tmp40 ) ; <<4 x i16>> [#uses=2]
- store <4 x i16> %tmp45, <4 x i16>* %A
- %tmp51 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1]
- %tmp52 = mul <4 x i16> %tmp45, %tmp51 ; <<4 x i16>> [#uses=2]
- store <4 x i16> %tmp52, <4 x i16>* %A
- %tmp55 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1]
- %tmp60 = tail call <4 x i16> @llvm.x86.mmx.pmulh.w( <4 x i16> %tmp52, <4 x i16> %tmp55 ) ; <<4 x i16>> [#uses=2]
- store <4 x i16> %tmp60, <4 x i16>* %A
- %tmp64 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1]
- %tmp69 = tail call <2 x i32> @llvm.x86.mmx.pmadd.wd( <4 x i16> %tmp60, <4 x i16> %tmp64 ) ; <<2 x i32>> [#uses=1]
- %tmp70 = bitcast <2 x i32> %tmp69 to <4 x i16> ; <<4 x i16>> [#uses=2]
- store <4 x i16> %tmp70, <4 x i16>* %A
- %tmp75 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1]
- %tmp76 = and <4 x i16> %tmp70, %tmp75 ; <<4 x i16>> [#uses=2]
- store <4 x i16> %tmp76, <4 x i16>* %A
- %tmp81 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1]
- %tmp82 = or <4 x i16> %tmp76, %tmp81 ; <<4 x i16>> [#uses=2]
- store <4 x i16> %tmp82, <4 x i16>* %A
- %tmp87 = load <4 x i16>* %B ; <<4 x i16>> [#uses=1]
- %tmp88 = xor <4 x i16> %tmp82, %tmp87 ; <<4 x i16>> [#uses=1]
- store <4 x i16> %tmp88, <4 x i16>* %A
+ %tmp1 = load x86_mmx* %A ; <x86_mmx> [#uses=1]
+ %tmp3 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp1a = bitcast x86_mmx %tmp1 to <4 x i16>
+ %tmp3a = bitcast x86_mmx %tmp3 to <4 x i16>
+ %tmp4 = add <4 x i16> %tmp1a, %tmp3a ; <<4 x i16>> [#uses=2]
+ %tmp4a = bitcast <4 x i16> %tmp4 to x86_mmx
+ store x86_mmx %tmp4a, x86_mmx* %A
+ %tmp7 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.w( x86_mmx %tmp4a, x86_mmx %tmp7 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp12, x86_mmx* %A
+ %tmp16 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.w( x86_mmx %tmp12, x86_mmx %tmp16 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp21, x86_mmx* %A
+ %tmp27 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp21a = bitcast x86_mmx %tmp21 to <4 x i16>
+ %tmp27a = bitcast x86_mmx %tmp27 to <4 x i16>
+ %tmp28 = sub <4 x i16> %tmp21a, %tmp27a ; <<4 x i16>> [#uses=2]
+ %tmp28a = bitcast <4 x i16> %tmp28 to x86_mmx
+ store x86_mmx %tmp28a, x86_mmx* %A
+ %tmp31 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.w( x86_mmx %tmp28a, x86_mmx %tmp31 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp36, x86_mmx* %A
+ %tmp40 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.w( x86_mmx %tmp36, x86_mmx %tmp40 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp45, x86_mmx* %A
+ %tmp51 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp45a = bitcast x86_mmx %tmp45 to <4 x i16>
+ %tmp51a = bitcast x86_mmx %tmp51 to <4 x i16>
+ %tmp52 = mul <4 x i16> %tmp45a, %tmp51a ; <<4 x i16>> [#uses=2]
+ %tmp52a = bitcast <4 x i16> %tmp52 to x86_mmx
+ store x86_mmx %tmp52a, x86_mmx* %A
+ %tmp55 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp60 = tail call x86_mmx @llvm.x86.mmx.pmulh.w( x86_mmx %tmp52a, x86_mmx %tmp55 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp60, x86_mmx* %A
+ %tmp64 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp69 = tail call x86_mmx @llvm.x86.mmx.pmadd.wd( x86_mmx %tmp60, x86_mmx %tmp64 ) ; <x86_mmx> [#uses=1]
+ %tmp70 = bitcast x86_mmx %tmp69 to x86_mmx ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp70, x86_mmx* %A
+ %tmp75 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp70a = bitcast x86_mmx %tmp70 to <4 x i16>
+ %tmp75a = bitcast x86_mmx %tmp75 to <4 x i16>
+ %tmp76 = and <4 x i16> %tmp70a, %tmp75a ; <<4 x i16>> [#uses=2]
+ %tmp76a = bitcast <4 x i16> %tmp76 to x86_mmx
+ store x86_mmx %tmp76a, x86_mmx* %A
+ %tmp81 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp76b = bitcast x86_mmx %tmp76a to <4 x i16>
+ %tmp81a = bitcast x86_mmx %tmp81 to <4 x i16>
+ %tmp82 = or <4 x i16> %tmp76b, %tmp81a ; <<4 x i16>> [#uses=2]
+ %tmp82a = bitcast <4 x i16> %tmp82 to x86_mmx
+ store x86_mmx %tmp82a, x86_mmx* %A
+ %tmp87 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp82b = bitcast x86_mmx %tmp82a to <4 x i16>
+ %tmp87a = bitcast x86_mmx %tmp87 to <4 x i16>
+ %tmp88 = xor <4 x i16> %tmp82b, %tmp87a ; <<4 x i16>> [#uses=1]
+ %tmp88a = bitcast <4 x i16> %tmp88 to x86_mmx
+ store x86_mmx %tmp88a, x86_mmx* %A
tail call void @llvm.x86.mmx.emms( )
ret void
}
-declare <8 x i8> @llvm.x86.mmx.padds.b(<8 x i8>, <8 x i8>)
+;; The following is modified to use MMX intrinsics everywhere they work.
-declare <8 x i8> @llvm.x86.mmx.paddus.b(<8 x i8>, <8 x i8>)
+define void @fooa(x86_mmx* %A, x86_mmx* %B) {
+entry:
+ %tmp1 = load x86_mmx* %A ; <x86_mmx> [#uses=1]
+ %tmp3 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp4 = tail call x86_mmx @llvm.x86.mmx.padd.b( x86_mmx %tmp1, x86_mmx %tmp3 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp4, x86_mmx* %A
+ %tmp7 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.b( x86_mmx %tmp4, x86_mmx %tmp7 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp12, x86_mmx* %A
+ %tmp16 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.b( x86_mmx %tmp12, x86_mmx %tmp16 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp21, x86_mmx* %A
+ %tmp27 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp28 = tail call x86_mmx @llvm.x86.mmx.psub.b( x86_mmx %tmp21, x86_mmx %tmp27 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp28, x86_mmx* %A
+ %tmp31 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.b( x86_mmx %tmp28, x86_mmx %tmp31 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp36, x86_mmx* %A
+ %tmp40 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.b( x86_mmx %tmp36, x86_mmx %tmp40 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp45, x86_mmx* %A
+ %tmp51 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp51a = bitcast x86_mmx %tmp51 to i64
+ %tmp51aa = bitcast i64 %tmp51a to <8 x i8>
+ %tmp51b = bitcast x86_mmx %tmp45 to <8 x i8>
+ %tmp52 = mul <8 x i8> %tmp51b, %tmp51aa ; <<8 x i8>> [#uses=2]
+ %tmp52a = bitcast <8 x i8> %tmp52 to i64
+ %tmp52aa = bitcast i64 %tmp52a to x86_mmx
+ store x86_mmx %tmp52aa, x86_mmx* %A
+ %tmp57 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp58 = tail call x86_mmx @llvm.x86.mmx.pand( x86_mmx %tmp51, x86_mmx %tmp57 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp58, x86_mmx* %A
+ %tmp63 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp64 = tail call x86_mmx @llvm.x86.mmx.por( x86_mmx %tmp58, x86_mmx %tmp63 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp64, x86_mmx* %A
+ %tmp69 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp70 = tail call x86_mmx @llvm.x86.mmx.pxor( x86_mmx %tmp64, x86_mmx %tmp69 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp70, x86_mmx* %A
+ tail call void @llvm.x86.mmx.emms( )
+ ret void
+}
-declare <8 x i8> @llvm.x86.mmx.psubs.b(<8 x i8>, <8 x i8>)
+define void @baza(x86_mmx* %A, x86_mmx* %B) {
+entry:
+ %tmp1 = load x86_mmx* %A ; <x86_mmx> [#uses=1]
+ %tmp3 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp4 = tail call x86_mmx @llvm.x86.mmx.padd.d( x86_mmx %tmp1, x86_mmx %tmp3 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp4, x86_mmx* %A
+ %tmp9 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp10 = tail call x86_mmx @llvm.x86.mmx.psub.d( x86_mmx %tmp4, x86_mmx %tmp9 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp10, x86_mmx* %A
+ %tmp15 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp10a = bitcast x86_mmx %tmp10 to <2 x i32>
+ %tmp15a = bitcast x86_mmx %tmp15 to <2 x i32>
+ %tmp16 = mul <2 x i32> %tmp10a, %tmp15a ; <<2 x i32>> [#uses=2]
+ %tmp16a = bitcast <2 x i32> %tmp16 to x86_mmx
+ store x86_mmx %tmp16a, x86_mmx* %A
+ %tmp21 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp22 = tail call x86_mmx @llvm.x86.mmx.pand( x86_mmx %tmp16a, x86_mmx %tmp21 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp22, x86_mmx* %A
+ %tmp27 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp28 = tail call x86_mmx @llvm.x86.mmx.por( x86_mmx %tmp22, x86_mmx %tmp27 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp28, x86_mmx* %A
+ %tmp33 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp34 = tail call x86_mmx @llvm.x86.mmx.pxor( x86_mmx %tmp28, x86_mmx %tmp33 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp34, x86_mmx* %A
+ tail call void @llvm.x86.mmx.emms( )
+ ret void
+}
-declare <8 x i8> @llvm.x86.mmx.psubus.b(<8 x i8>, <8 x i8>)
+define void @bara(x86_mmx* %A, x86_mmx* %B) {
+entry:
+ %tmp1 = load x86_mmx* %A ; <x86_mmx> [#uses=1]
+ %tmp3 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp4 = tail call x86_mmx @llvm.x86.mmx.padd.w( x86_mmx %tmp1, x86_mmx %tmp3 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp4, x86_mmx* %A
+ %tmp7 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp12 = tail call x86_mmx @llvm.x86.mmx.padds.w( x86_mmx %tmp4, x86_mmx %tmp7 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp12, x86_mmx* %A
+ %tmp16 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp21 = tail call x86_mmx @llvm.x86.mmx.paddus.w( x86_mmx %tmp12, x86_mmx %tmp16 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp21, x86_mmx* %A
+ %tmp27 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp28 = tail call x86_mmx @llvm.x86.mmx.psub.w( x86_mmx %tmp21, x86_mmx %tmp27 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp28, x86_mmx* %A
+ %tmp31 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp36 = tail call x86_mmx @llvm.x86.mmx.psubs.w( x86_mmx %tmp28, x86_mmx %tmp31 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp36, x86_mmx* %A
+ %tmp40 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp45 = tail call x86_mmx @llvm.x86.mmx.psubus.w( x86_mmx %tmp36, x86_mmx %tmp40 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp45, x86_mmx* %A
+ %tmp51 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp52 = tail call x86_mmx @llvm.x86.mmx.pmull.w( x86_mmx %tmp45, x86_mmx %tmp51 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp52, x86_mmx* %A
+ %tmp55 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp60 = tail call x86_mmx @llvm.x86.mmx.pmulh.w( x86_mmx %tmp52, x86_mmx %tmp55 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp60, x86_mmx* %A
+ %tmp64 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp69 = tail call x86_mmx @llvm.x86.mmx.pmadd.wd( x86_mmx %tmp60, x86_mmx %tmp64 ) ; <x86_mmx> [#uses=1]
+ %tmp70 = bitcast x86_mmx %tmp69 to x86_mmx ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp70, x86_mmx* %A
+ %tmp75 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp76 = tail call x86_mmx @llvm.x86.mmx.pand( x86_mmx %tmp70, x86_mmx %tmp75 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp76, x86_mmx* %A
+ %tmp81 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp82 = tail call x86_mmx @llvm.x86.mmx.por( x86_mmx %tmp76, x86_mmx %tmp81 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp82, x86_mmx* %A
+ %tmp87 = load x86_mmx* %B ; <x86_mmx> [#uses=1]
+ %tmp88 = tail call x86_mmx @llvm.x86.mmx.pxor( x86_mmx %tmp82, x86_mmx %tmp87 ) ; <x86_mmx> [#uses=2]
+ store x86_mmx %tmp88, x86_mmx* %A
+ tail call void @llvm.x86.mmx.emms( )
+ ret void
+}
-declare <4 x i16> @llvm.x86.mmx.padds.w(<4 x i16>, <4 x i16>)
+declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx)
-declare <4 x i16> @llvm.x86.mmx.paddus.w(<4 x i16>, <4 x i16>)
+declare x86_mmx @llvm.x86.mmx.psubus.b(x86_mmx, x86_mmx)
-declare <4 x i16> @llvm.x86.mmx.psubs.w(<4 x i16>, <4 x i16>)
+declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
-declare <4 x i16> @llvm.x86.mmx.psubus.w(<4 x i16>, <4 x i16>)
+declare x86_mmx @llvm.x86.mmx.psubus.w(x86_mmx, x86_mmx)
-declare <4 x i16> @llvm.x86.mmx.pmulh.w(<4 x i16>, <4 x i16>)
+declare x86_mmx @llvm.x86.mmx.pmulh.w(x86_mmx, x86_mmx)
-declare <2 x i32> @llvm.x86.mmx.pmadd.wd(<4 x i16>, <4 x i16>)
+declare x86_mmx @llvm.x86.mmx.pmadd.wd(x86_mmx, x86_mmx)
declare void @llvm.x86.mmx.emms()
+
+declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padds.b(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padds.w(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padds.d(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.psubs.b(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.psubs.w(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.psubs.d(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.psub.b(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.psub.w(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.psub.d(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.pmull.w(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.pand(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.por(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.pxor(x86_mmx, x86_mmx)
+
diff --git a/test/CodeGen/X86/mmx-bitcast-to-i64.ll b/test/CodeGen/X86/mmx-bitcast-to-i64.ll
index 1fd8f67a0c..8b1840abf6 100644
--- a/test/CodeGen/X86/mmx-bitcast-to-i64.ll
+++ b/test/CodeGen/X86/mmx-bitcast-to-i64.ll
@@ -1,26 +1,31 @@
; RUN: llc < %s -march=x86-64 | grep movd | count 4
-define i64 @foo(<1 x i64>* %p) {
- %t = load <1 x i64>* %p
- %u = add <1 x i64> %t, %t
- %s = bitcast <1 x i64> %u to i64
+define i64 @foo(x86_mmx* %p) {
+ %t = load x86_mmx* %p
+ %u = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %t, x86_mmx %t)
+ %s = bitcast x86_mmx %u to i64
ret i64 %s
}
-define i64 @goo(<2 x i32>* %p) {
- %t = load <2 x i32>* %p
- %u = add <2 x i32> %t, %t
- %s = bitcast <2 x i32> %u to i64
+define i64 @goo(x86_mmx* %p) {
+ %t = load x86_mmx* %p
+ %u = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %t, x86_mmx %t)
+ %s = bitcast x86_mmx %u to i64
ret i64 %s
}
-define i64 @hoo(<4 x i16>* %p) {
- %t = load <4 x i16>* %p
- %u = add <4 x i16> %t, %t
- %s = bitcast <4 x i16> %u to i64
+define i64 @hoo(x86_mmx* %p) {
+ %t = load x86_mmx* %p
+ %u = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %t, x86_mmx %t)
+ %s = bitcast x86_mmx %u to i64
ret i64 %s
}
-define i64 @ioo(<8 x i8>* %p) {
- %t = load <8 x i8>* %p
- %u = add <8 x i8> %t, %t
- %s = bitcast <8 x i8> %u to i64
+define i64 @ioo(x86_mmx* %p) {
+ %t = load x86_mmx* %p
+ %u = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %t, x86_mmx %t)
+ %s = bitcast x86_mmx %u to i64
ret i64 %s
}
+
+declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
+declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx)
diff --git a/test/CodeGen/X86/mmx-insert-element.ll b/test/CodeGen/X86/mmx-insert-element.ll
index a063ee1d6c..9f9bd8de2d 100644
--- a/test/CodeGen/X86/mmx-insert-element.ll
+++ b/test/CodeGen/X86/mmx-insert-element.ll
@@ -1,7 +1,9 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx | not grep movq
-; RUN: llc < %s -march=x86 -mattr=+mmx | grep psllq
+; RUN: llc < %s -march=x86 -mattr=+mmx,+sse | grep movq
+; RUN: llc < %s -march=x86 -mattr=+mmx,+sse | grep pshufd
+; This is not an MMX operation; promoted to XMM.
-define <2 x i32> @qux(i32 %A) nounwind {
+define x86_mmx @qux(i32 %A) nounwind {
%tmp3 = insertelement <2 x i32> < i32 0, i32 undef >, i32 %A, i32 1 ; <<2 x i32>> [#uses=1]
- ret <2 x i32> %tmp3
+ %tmp4 = bitcast <2 x i32> %tmp3 to x86_mmx
+ ret x86_mmx %tmp4
}
diff --git a/test/CodeGen/X86/mmx-punpckhdq.ll b/test/CodeGen/X86/mmx-punpckhdq.ll
index 0af7e017b6..4818baedc4 100644
--- a/test/CodeGen/X86/mmx-punpckhdq.ll
+++ b/test/CodeGen/X86/mmx-punpckhdq.ll
@@ -1,4 +1,6 @@
+; RUN: llc < %s -march=x86 -mattr=+mmx | grep pextrd
; RUN: llc < %s -march=x86 -mattr=+mmx | grep punpckhdq | count 1
+; There are no MMX operations in bork; promoted to XMM.
define void @bork(<1 x i64>* %x) {
entry:
@@ -11,4 +13,16 @@ entry:
ret void
}
+; pork uses MMX.
+
+define void @pork(x86_mmx* %x) {
+entry:
+ %tmp2 = load x86_mmx* %x ; <x86_mmx> [#uses=1]
+ %tmp9 = tail call x86_mmx @llvm.x86.mmx.punpckhdq (x86_mmx %tmp2, x86_mmx %tmp2)
+ store x86_mmx %tmp9, x86_mmx* %x
+ tail call void @llvm.x86.mmx.emms( )
+ ret void
+}
+
+declare x86_mmx @llvm.x86.mmx.punpckhdq(x86_mmx, x86_mmx)
declare void @llvm.x86.mmx.emms()
diff --git a/test/CodeGen/X86/mmx-shift.ll b/test/CodeGen/X86/mmx-shift.ll
index dd0aa2ca31..bafc75444d 100644
--- a/test/CodeGen/X86/mmx-shift.ll
+++ b/test/CodeGen/X86/mmx-shift.ll
@@ -5,28 +5,28 @@
define i64 @t1(<1 x i64> %mm1) nounwind {
entry:
- %tmp6 = tail call <1 x i64> @llvm.x86.mmx.pslli.q( <1 x i64> %mm1, i32 32 ) ; <<1 x i64>> [#uses=1]
- %retval1112 = bitcast <1 x i64> %tmp6 to i64 ; <i64> [#uses=1]
+ %tmp = bitcast <1 x i64> %mm1 to x86_mmx
+ %tmp6 = tail call x86_mmx @llvm.x86.mmx.pslli.q( x86_mmx %tmp, i32 32 ) ; <x86_mmx> [#uses=1]
+ %retval1112 = bitcast x86_mmx %tmp6 to i64
ret i64 %retval1112
}
-declare <1 x i64> @llvm.x86.mmx.pslli.q(<1 x i64>, i32) nounwind readnone
+declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32) nounwind readnone
-define i64 @t2(<2 x i32> %mm1, <2 x i32> %mm2) nounwind {
+define i64 @t2(x86_mmx %mm1, x86_mmx %mm2) nounwind {
entry:
- %tmp7 = tail call <2 x i32> @llvm.x86.mmx.psra.d( <2 x i32> %mm1, <2 x i32> %mm2 ) nounwind readnone ; <<2 x i32>> [#uses=1]
- %retval1112 = bitcast <2 x i32> %tmp7 to i64 ; <i64> [#uses=1]
+ %tmp7 = tail call x86_mmx @llvm.x86.mmx.psra.d( x86_mmx %mm1, x86_mmx %mm2 ) nounwind readnone ; <x86_mmx> [#uses=1]
+ %retval1112 = bitcast x86_mmx %tmp7 to i64
ret i64 %retval1112
}
-declare <2 x i32> @llvm.x86.mmx.psra.d(<2 x i32>, <2 x i32>) nounwind readnone
+declare x86_mmx @llvm.x86.mmx.psra.d(x86_mmx, x86_mmx) nounwind readnone
-define i64 @t3(<1 x i64> %mm1, i32 %bits) nounwind {
+define i64 @t3(x86_mmx %mm1, i32 %bits) nounwind {
entry:
- %tmp6 = bitcast <1 x i64> %mm1 to <4 x i16> ; <<4 x i16>> [#uses=1]
- %tmp8 = tail call <4 x i16> @llvm.x86.mmx.psrli.w( <4 x i16> %tmp6, i32 %bits ) nounwind readnone ; <<4 x i16>> [#uses=1]
- %retval1314 = bitcast <4 x i16> %tmp8 to i64 ; <i64> [#uses=1]
+ %tmp8 = tail call x86_mmx @llvm.x86.mmx.psrli.w( x86_mmx %mm1, i32 %bits ) nounwind readnone ; <x86_mmx> [#uses=1]
+ %retval1314 = bitcast x86_mmx %tmp8 to i64
ret i64 %retval1314
}
-declare <4 x i16> @llvm.x86.mmx.psrli.w(<4 x i16>, i32) nounwind readnone
+declare x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx, i32) nounwind readnone
diff --git a/test/CodeGen/X86/mmx-shuffle.ll b/test/CodeGen/X86/mmx-shuffle.ll
index e3125c7345..9f7501eb7c 100644
--- a/test/CodeGen/X86/mmx-shuffle.ll
+++ b/test/CodeGen/X86/mmx-shuffle.ll
@@ -22,8 +22,10 @@ entry:
%tmp542 = bitcast <2 x i32> %tmp529 to <4 x i16> ; <<4 x i16>> [#uses=1]
%tmp543 = add <4 x i16> %tmp542, < i16 0, i16 16448, i16 24672, i16 28784 > ; <<4 x i16>> [#uses=1]
%tmp555 = bitcast <4 x i16> %tmp543 to <8 x i8> ; <<8 x i8>> [#uses=1]
- tail call void @llvm.x86.mmx.maskmovq( <8 x i8> zeroinitializer, <8 x i8> %tmp555, i8* null )
+ %tmp556 = bitcast <8 x i8> %tmp555 to x86_mmx
+ %tmp557 = bitcast <8 x i8> zeroinitializer to x86_mmx
+ tail call void @llvm.x86.mmx.maskmovq( x86_mmx %tmp557, x86_mmx %tmp556, i8* null )
ret void
}
-declare void @llvm.x86.mmx.maskmovq(<8 x i8>, <8 x i8>, i8*)
+declare void @llvm.x86.mmx.maskmovq(x86_mmx, x86_mmx, i8*)
diff --git a/test/CodeGen/X86/mmx-vzmovl-2.ll b/test/CodeGen/X86/mmx-vzmovl-2.ll
index 8253c20032..4ad420b37c 100644
--- a/test/CodeGen/X86/mmx-vzmovl-2.ll
+++ b/test/CodeGen/X86/mmx-vzmovl-2.ll
@@ -4,7 +4,7 @@
%struct.vS1024 = type { [8 x <4 x i32>] }
%struct.vS512 = type { [4 x <4 x i32>] }
-declare <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64>, i32) nounwind readnone
+declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) nounwind readnone
define void @t() nounwind {
entry:
@@ -12,14 +12,18 @@ entry:
bb554: ; preds = %bb554, %entry
%sum.0.reg2mem.0 = phi <1 x i64> [ %tmp562, %bb554 ], [ zeroinitializer, %entry ] ; <<1 x i64>> [#uses=1]
- %0 = load <1 x i64>* null, align 8 ; <<1 x i64>> [#uses=2]
- %1 = bitcast <1 x i64> %0 to <2 x i32> ; <<2 x i32>> [#uses=1]
+ %0 = load x86_mmx* null, align 8 ; <x86_mmx> [#uses=2]
+ %1 = bitcast x86_mmx %0 to <2 x i32> ; <<2 x i32>> [#uses=1]
%tmp555 = and <2 x i32> %1, < i32 -1, i32 0 > ; <<2 x i32>> [#uses=1]
- %2 = bitcast <2 x i32> %tmp555 to <1 x i64> ; <<1 x i64>> [#uses=1]
- %3 = call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %0, i32 32) nounwind readnone ; <<1 x i64>> [#uses=1]
+ %2 = bitcast <2 x i32> %tmp555 to x86_mmx ; <x86_mmx> [#uses=1]
+ %3 = call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %0, i32 32) nounwind readnone ; <x86_mmx> [#uses=1]
store <1 x i64> %sum.0.reg2mem.0, <1 x i64>* null
- %tmp558 = add <1 x i64> %sum.0.reg2mem.0, %2 ; <<1 x i64>> [#uses=1]
- %4 = call <1 x i64> @llvm.x86.mmx.psrli.q(<1 x i64> %tmp558, i32 32) nounwind readnone ; <<1 x i64>> [#uses=1]
- %tmp562 = add <1 x i64> %4, %3 ; <<1 x i64>> [#uses=1]
+ %tmp3 = bitcast x86_mmx %2 to <1 x i64>
+ %tmp558 = add <1 x i64> %sum.0.reg2mem.0, %tmp3 ; <<1 x i64>> [#uses=1]
+ %tmp5 = bitcast <1 x i64> %tmp558 to x86_mmx
+ %4 = call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %tmp5, i32 32) nounwind readnone ; <x86_mmx> [#uses=1]
+ %tmp6 = bitcast x86_mmx %4 to <1 x i64>
+ %tmp7 = bitcast x86_mmx %3 to <1 x i64>
+ %tmp562 = add <1 x i64> %tmp6, %tmp7 ; <<1 x i64>> [#uses=1]
br label %bb554
}
diff --git a/test/CodeGen/X86/mmx-vzmovl.ll b/test/CodeGen/X86/mmx-vzmovl.ll
index d21e240488..e8b34263c6 100644
--- a/test/CodeGen/X86/mmx-vzmovl.ll
+++ b/test/CodeGen/X86/mmx-vzmovl.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86-64 -mattr=+mmx | grep movd
-; RUN: llc < %s -march=x86-64 -mattr=+mmx | grep movq
+; RUN: llc < %s -march=x86-64 -mattr=+mmx | grep movq | count 2
+; There are no MMX operations here; this is promoted to XMM.
define void @foo(<1 x i64>* %a, <1 x i64>* %b) nounwind {
entry:
diff --git a/test/CodeGen/X86/vec_insert-5.ll b/test/CodeGen/X86/vec_insert-5.ll
index 291fc0454c..471cc1611f 100644
--- a/test/CodeGen/X86/vec_insert-5.ll
+++ b/test/CodeGen/X86/vec_insert-5.ll
@@ -1,15 +1,16 @@
; RUN: llc < %s -march=x86 -mattr=+sse2 > %t
-; RUN: grep psllq %t | grep 32
+; RUN: grep shll %t | grep 12
; RUN: grep pslldq %t | grep 12
; RUN: grep psrldq %t | grep 8
; RUN: grep psrldq %t | grep 12
+; There are no MMX operations in @t1
-define void @t1(i32 %a, <1 x i64>* %P) nounwind {
+define void @t1(i32 %a, x86_mmx* %P) nounwind {
%tmp12 = shl i32 %a, 12
%tmp21 = insertelement <2 x i32> undef, i32 %tmp12, i32 1
%tmp22 = insertelement <2 x i32> %tmp21, i32 0, i32 0
- %tmp23 = bitcast <2 x i32> %tmp22 to <1 x i64>
- store <1 x i64> %tmp23, <1 x i64>* %P
+ %tmp23 = bitcast <2 x i32> %tmp22 to x86_mmx
+ store x86_mmx %tmp23, x86_mmx* %P
ret void
}
diff --git a/test/CodeGen/X86/vec_insert-7.ll b/test/CodeGen/X86/vec_insert-7.ll
index 9ede10f63d..ea7f919304 100644
--- a/test/CodeGen/X86/vec_insert-7.ll
+++ b/test/CodeGen/X86/vec_insert-7.ll
@@ -1,8 +1,12 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx -mtriple=i686-apple-darwin9 -o - | grep punpckldq
+; RUN: llc < %s -march=x86 -mattr=+mmx,+sse -mtriple=i686-apple-darwin9 -o - | grep pinsrd | count 2
+; MMX insertelement is not available; these are promoted to XMM.
+; (Without SSE they are split to two ints, and the code is much better.)
-define <2 x i32> @mmx_movzl(<2 x i32> %x) nounwind {
+define x86_mmx @mmx_movzl(x86_mmx %x) nounwind {
entry:
- %tmp3 = insertelement <2 x i32> %x, i32 32, i32 0 ; <<2 x i32>> [#uses=1]
+ %tmp = bitcast x86_mmx %x to <2 x i32>
+ %tmp3 = insertelement <2 x i32> %tmp, i32 32, i32 0 ; <<2 x i32>> [#uses=1]
%tmp8 = insertelement <2 x i32> %tmp3, i32 0, i32 1 ; <<2 x i32>> [#uses=1]
- ret <2 x i32> %tmp8
+ %tmp9 = bitcast <2 x i32> %tmp8 to x86_mmx
+ ret x86_mmx %tmp9
}
diff --git a/test/CodeGen/X86/vec_zero_cse.ll b/test/CodeGen/X86/vec_zero_cse.ll
index 3b15d4cc40..8aa50945e6 100644
--- a/test/CodeGen/X86/vec_zero_cse.ll
+++ b/test/CodeGen/X86/vec_zero_cse.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -relocation-model=static -march=x86 -mcpu=yonah | grep pxor | count 2
-; RUN: llc < %s -relocation-model=static -march=x86 -mcpu=yonah | grep pcmpeqd | count 2
+; RUN: llc < %s -relocation-model=static -march=x86 -mcpu=yonah | grep pxor | count 1
+; RUN: llc < %s -relocation-model=static -march=x86 -mcpu=yonah | grep pcmpeqd | count 1
+; 64-bit stores here do not use MMX.
@M1 = external global <1 x i64>
@M2 = external global <2 x i32>