From 255f20f7f76e4ca1ac1c73294852cb6fcb18c77d Mon Sep 17 00:00:00 2001 From: Evan Cheng Date: Thu, 1 Apr 2010 06:04:33 +0000 Subject: Fix sdisel memcpy, memset, memmove lowering: 1. Makes it possible to lower with floating point loads and stores. 2. Avoid unaligned loads / stores unless it's fast. 3. Fix some memcpy lowering logic bug related to when to optimize a load from constant string into a constant. 4. Adjust x86 memcpy lowering threshold to make it more sane. 5. Fix x86 target hook so it uses vector and floating point memory ops more effectively. rdar://7774704 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@100090 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll | 2 +- test/CodeGen/X86/byval7.ll | 9 ++- test/CodeGen/X86/memcpy-2.ll | 100 ++++++++++++++++++++++++-- test/CodeGen/X86/memset-2.ll | 46 ++---------- test/CodeGen/X86/memset64-on-x86-32.ll | 2 +- test/CodeGen/X86/small-byval-memcpy.ll | 2 +- test/CodeGen/X86/unaligned-load.ll | 25 +++++-- 7 files changed, 130 insertions(+), 56 deletions(-) (limited to 'test/CodeGen') diff --git a/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll b/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll index 3ce9edb5db..26bf09c723 100644 --- a/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll +++ b/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s ; rdar://7396984 @str = private constant [28 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxx\00", align 1 diff --git a/test/CodeGen/X86/byval7.ll b/test/CodeGen/X86/byval7.ll index 0da93bad04..686ed9c74d 100644 --- a/test/CodeGen/X86/byval7.ll +++ b/test/CodeGen/X86/byval7.ll @@ -1,10 +1,17 @@ -; RUN: llc < %s -march=x86 -mcpu=yonah | egrep {add|lea} | grep 16 +; RUN: llc < %s -march=x86 -mcpu=yonah | FileCheck %s %struct.S = type { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, + <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } define i32 @main() nounwind { entry: +; CHECK: main: +; CHECK: movl $1, (%esp) +; CHECK: leal 16(%esp), %edi +; CHECK: movl $36, %ecx +; CHECK: leal 160(%esp), %esi +; CHECK: rep;movsl %s = alloca %struct.S ; <%struct.S*> [#uses=2] %tmp15 = getelementptr %struct.S* %s, i32 0, i32 0 ; <<2 x i64>*> [#uses=1] store <2 x i64> < i64 8589934595, i64 1 >, <2 x i64>* %tmp15, align 16 diff --git a/test/CodeGen/X86/memcpy-2.ll b/test/CodeGen/X86/memcpy-2.ll index 2dc939e666..079c40252b 100644 --- a/test/CodeGen/X86/memcpy-2.ll +++ b/test/CodeGen/X86/memcpy-2.ll @@ -1,15 +1,105 @@ -; RUN: llc < %s -march=x86 -mattr=-sse -mtriple=i686-apple-darwin8.8.0 | grep mov | count 7 -; RUN: llc < %s -march=x86 -mattr=+sse -mtriple=i686-apple-darwin8.8.0 | grep mov | count 5 +; RUN: llc < %s -mattr=+sse2 -mtriple=i686-apple-darwin | FileCheck %s -check-prefix=SSE2 +; RUN: llc < %s -mattr=+sse,-sse2 -mtriple=i686-apple-darwin | FileCheck %s -check-prefix=SSE1 +; RUN: llc < %s -mattr=-sse -mtriple=i686-apple-darwin | FileCheck %s -check-prefix=NOSSE %struct.ParmT = type { [25 x i8], i8, i8* } @.str12 = internal constant [25 x i8] c"image\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00\00" ; <[25 x i8]*> [#uses=1] -declare void @llvm.memcpy.i32(i8*, i8*, i32, i32) nounwind - -define void @t(i32 %argc, i8** %argv) nounwind { +define void @t1(i32 %argc, i8** %argv) nounwind { entry: +; SSE2: t1: +; SSE2: movaps _.str12, %xmm0 +; SSE2: movaps %xmm0 +; SSE2: movb $0 +; SSE2: movl $0 +; SSE2: movl $0 + +; SSE1: t1: +; SSE1: movaps _.str12, %xmm0 +; SSE1: movaps %xmm0 +; SSE1: movb $0 +; SSE1: movl $0 +; SSE1: movl $0 + +; NOSSE: t1: +; NOSSE: movb $0 +; NOSSE: movl $0 +; NOSSE: movl $0 +; NOSSE: movl $0 +; NOSSE: movl $0 +; NOSSE: movl $101 +; NOSSE: movl $1734438249 %parms.i = alloca [13 x %struct.ParmT] ; <[13 x %struct.ParmT]*> [#uses=1] %parms1.i = getelementptr [13 x %struct.ParmT]* %parms.i, i32 0, i32 0, i32 0, i32 0 ; [#uses=1] call void @llvm.memcpy.i32( i8* %parms1.i, i8* getelementptr ([25 x i8]* @.str12, i32 0, i32 0), i32 25, i32 1 ) nounwind unreachable } + +;rdar://7774704 +%struct.s0 = type { [2 x double] } + +define void @t2(%struct.s0* nocapture %a, %struct.s0* nocapture %b) nounwind ssp { +entry: +; SSE2: t2: +; SSE2: movaps (%eax), %xmm0 +; SSE2: movaps %xmm0, (%eax) + +; SSE1: t2: +; SSE1: movaps (%eax), %xmm0 +; SSE1: movaps %xmm0, (%eax) + +; NOSSE: t2: +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl + %tmp2 = bitcast %struct.s0* %a to i8* ; [#uses=1] + %tmp3 = bitcast %struct.s0* %b to i8* ; [#uses=1] + tail call void @llvm.memcpy.i32(i8* %tmp2, i8* %tmp3, i32 16, i32 16) + ret void +} + +define void @t3(%struct.s0* nocapture %a, %struct.s0* nocapture %b) nounwind ssp { +entry: +; SSE2: t3: +; SSE2: movsd (%eax), %xmm0 +; SSE2: movsd 8(%eax), %xmm1 +; SSE2: movsd %xmm1, 8(%eax) +; SSE2: movsd %xmm0, (%eax) + +; SSE1: t3: +; SSE1: movl +; SSE1: movl +; SSE1: movl +; SSE1: movl +; SSE1: movl +; SSE1: movl +; SSE1: movl +; SSE1: movl +; SSE1: movl +; SSE1: movl + +; NOSSE: t3: +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl +; NOSSE: movl + %tmp2 = bitcast %struct.s0* %a to i8* ; [#uses=1] + %tmp3 = bitcast %struct.s0* %b to i8* ; [#uses=1] + tail call void @llvm.memcpy.i32(i8* %tmp2, i8* %tmp3, i32 16, i32 8) + ret void +} + +declare void @llvm.memcpy.i32(i8* nocapture, i8* nocapture, i32, i32) nounwind diff --git a/test/CodeGen/X86/memset-2.ll b/test/CodeGen/X86/memset-2.ll index 7deb52f807..e2eba76da8 100644 --- a/test/CodeGen/X86/memset-2.ll +++ b/test/CodeGen/X86/memset-2.ll @@ -1,47 +1,13 @@ -; RUN: llc < %s | not grep rep -; RUN: llc < %s | grep memset +; RUN: llc < %s | FileCheck %s target triple = "i386" declare void @llvm.memset.i32(i8*, i8, i32, i32) nounwind -define fastcc i32 @cli_scanzip(i32 %desc) nounwind { +define fastcc void @t() nounwind { entry: - br label %bb8.i.i.i.i - -bb8.i.i.i.i: ; preds = %bb8.i.i.i.i, %entry - icmp eq i32 0, 0 ; :0 [#uses=1] - br i1 %0, label %bb61.i.i.i, label %bb8.i.i.i.i - -bb32.i.i.i: ; preds = %bb61.i.i.i - ptrtoint i8* %tail.0.i.i.i to i32 ; :1 [#uses=1] - sub i32 0, %1 ; :2 [#uses=1] - icmp sgt i32 %2, 19 ; :3 [#uses=1] - br i1 %3, label %bb34.i.i.i, label %bb61.i.i.i - -bb34.i.i.i: ; preds = %bb32.i.i.i - load i32* null, align 4 ; :4 [#uses=1] - icmp eq i32 %4, 101010256 ; :5 [#uses=1] - br i1 %5, label %bb8.i11.i.i.i, label %bb61.i.i.i - -bb8.i11.i.i.i: ; preds = %bb8.i11.i.i.i, %bb34.i.i.i - icmp eq i32 0, 0 ; :6 [#uses=1] - br i1 %6, label %cli_dbgmsg.exit49.i, label %bb8.i11.i.i.i - -cli_dbgmsg.exit49.i: ; preds = %bb8.i11.i.i.i - icmp eq [32768 x i8]* null, null ; :7 [#uses=1] - br i1 %7, label %bb1.i28.i, label %bb8.i.i - -bb61.i.i.i: ; preds = %bb61.i.i.i, %bb34.i.i.i, %bb32.i.i.i, %bb8.i.i.i.i - %tail.0.i.i.i = getelementptr [1024 x i8]* null, i32 0, i32 0 ; [#uses=2] - load i8* %tail.0.i.i.i, align 1 ; :8 [#uses=1] - icmp eq i8 %8, 80 ; :9 [#uses=1] - br i1 %9, label %bb32.i.i.i, label %bb61.i.i.i - -bb1.i28.i: ; preds = %cli_dbgmsg.exit49.i - call void @llvm.memset.i32( i8* null, i8 0, i32 88, i32 1 ) nounwind - unreachable - -bb8.i.i: ; preds = %bb8.i.i, %cli_dbgmsg.exit49.i - br label %bb8.i.i +; CHECK: t: +; CHECK: call memset + call void @llvm.memset.i32( i8* null, i8 0, i32 188, i32 1 ) nounwind + unreachable } diff --git a/test/CodeGen/X86/memset64-on-x86-32.ll b/test/CodeGen/X86/memset64-on-x86-32.ll index da8fc51da8..8b817b49ee 100644 --- a/test/CodeGen/X86/memset64-on-x86-32.ll +++ b/test/CodeGen/X86/memset64-on-x86-32.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=i386-apple-darwin | grep stosl +; RUN: llc < %s -mtriple=i386-apple-darwin | grep movl | count 20 ; RUN: llc < %s -mtriple=x86_64-apple-darwin | grep movq | count 10 define void @bork() nounwind { diff --git a/test/CodeGen/X86/small-byval-memcpy.ll b/test/CodeGen/X86/small-byval-memcpy.ll index 9ec9182e5e..711dc51e4d 100644 --- a/test/CodeGen/X86/small-byval-memcpy.ll +++ b/test/CodeGen/X86/small-byval-memcpy.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s | not grep movs +; RUN: llc < %s | grep movsd | count 8 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" target triple = "i386-apple-darwin8" diff --git a/test/CodeGen/X86/unaligned-load.ll b/test/CodeGen/X86/unaligned-load.ll index b61803d10c..2e9b248314 100644 --- a/test/CodeGen/X86/unaligned-load.ll +++ b/test/CodeGen/X86/unaligned-load.ll @@ -1,4 +1,5 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=core2 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck -check-prefix=CORE2 %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin10.0 -mcpu=corei7 -relocation-model=dynamic-no-pic --asm-verbose=0 | FileCheck -check-prefix=COREI7 %s @.str1 = internal constant [31 x i8] c"DHRYSTONE PROGRAM, SOME STRING\00", align 8 @.str3 = internal constant [31 x i8] c"DHRYSTONE PROGRAM, 2'ND STRING\00", align 8 @@ -11,7 +12,11 @@ entry: bb: %String2Loc9 = getelementptr inbounds [31 x i8]* %String2Loc, i64 0, i64 0 call void @llvm.memcpy.i64(i8* %String2Loc9, i8* getelementptr inbounds ([31 x i8]* @.str3, i64 0, i64 0), i64 31, i32 1) -; CHECK: movups _.str3 +; CORE2: movsd _.str3+16 +; CORE2: movsd _.str3+8 +; CORE2: movsd _.str3 + +; COREI7: movups _.str3 br label %bb return: @@ -20,8 +25,14 @@ return: declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind -; CHECK: .align 3 -; CHECK-NEXT: _.str1: -; CHECK-NEXT: .asciz "DHRYSTONE PROGRAM, SOME STRING" -; CHECK: .align 3 -; CHECK-NEXT: _.str3: +; CORE2: .align 3 +; CORE2-NEXT: _.str1: +; CORE2-NEXT: .asciz "DHRYSTONE PROGRAM, SOME STRING" +; CORE2: .align 3 +; CORE2-NEXT: _.str3: + +; COREI7: .align 3 +; COREI7-NEXT: _.str1: +; COREI7-NEXT: .asciz "DHRYSTONE PROGRAM, SOME STRING" +; COREI7: .align 3 +; COREI7-NEXT: _.str3: -- cgit v1.2.3