commit    c11ab17a8e384ab1ee2642a0640581fed515b158
tree      f10fe4d442988d85377ef3bb2c450280372e0786  lib/Target/X86/README-SSE.txt
parent    2420d812475ebbb835585db1b2bbad04e55cb6f3
author    Chris Lattner <sabre@nondot.org>  2006-05-19 20:51:43 +0000
committer Chris Lattner <sabre@nondot.org>  2006-05-19 20:51:43 +0000
Split the SSE readme items out into their own README.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@28400 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Target/X86/README-SSE.txt')

 lib/Target/X86/README-SSE.txt | 662 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 662 insertions(+), 0 deletions(-)
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
new file mode 100644
index 0000000000..123446fae9
--- /dev/null
+++ b/lib/Target/X86/README-SSE.txt
@@ -0,0 +1,662 @@

//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend: SSE-specific stuff.
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//

When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
other fast SSE modes.
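For concreteness, a sketch of what such a startup hook could do. The function
name is hypothetical; the constants are the architectural MXCSR bits (FTZ is
bit 15, DAZ is bit 6), and DAZ is only available on SSE2-class processors:

#include <xmmintrin.h>

/* Hypothetical startup hook: turn on the fast SSE modes in MXCSR. */
void enable_fast_sse_modes(void) {
  unsigned int csr = _mm_getcsr();
  csr |= 0x8000;    /* FTZ: flush denormal results to zero */
  csr |= 0x0040;    /* DAZ: treat denormal inputs as zero (SSE2+) */
  _mm_setcsr(csr);
}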
//===---------------------------------------------------------------------===//

Think about doing i64 math in SSE regs.

//===---------------------------------------------------------------------===//

This testcase should have no SSE instructions in it, and only one load from
a constant pool:

double %test3(bool %B) {
    %C = select bool %B, double 123.412, double 523.01123123
    ret double %C
}

Currently, the select is being lowered, which prevents the dag combiner from
turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'.

The pattern isel got this one right.

//===---------------------------------------------------------------------===//

SSE doesn't have [mem] op= reg instructions.  If we have an SSE instruction
like this:

  X += y

and the register allocator decides to spill X, it is cheaper to emit this as:

    Y += [xslot]
    store Y -> [xslot]

than as:

    tmp = [xslot]
    tmp += y
    store tmp -> [xslot]

... and this uses one fewer register (so this should be done at load folding
time, not at spiller time).  *Note*, however, that this can only be done if
Y is dead.  Here's a testcase:

%.str_3 = external global [15 x sbyte]          ; <[15 x sbyte]*> [#uses=0]
implementation   ; Functions:
declare void %printf(int, ...)
void %main() {
build_tree.exit:
    br label %no_exit.i7
no_exit.i7:     ; preds = %no_exit.i7, %build_tree.exit
    %tmp.0.1.0.i9 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.34.i18, %no_exit.i7 ]   ; <double> [#uses=1]
    %tmp.0.0.0.i10 = phi double [ 0.000000e+00, %build_tree.exit ], [ %tmp.28.i16, %no_exit.i7 ]  ; <double> [#uses=1]
    %tmp.28.i16 = add double %tmp.0.0.0.i10, 0.000000e+00
    %tmp.34.i18 = add double %tmp.0.1.0.i9, 0.000000e+00
    br bool false, label %Compute_Tree.exit23, label %no_exit.i7
Compute_Tree.exit23:        ; preds = %no_exit.i7
    tail call void (int, ...)* %printf( int 0 )
    store double %tmp.34.i18, double* null
    ret void
}

We currently emit:

.BBmain_1:
    xorpd %XMM1, %XMM1
    addsd %XMM0, %XMM1
*** movsd %XMM2, QWORD PTR [%ESP + 8]
*** addsd %XMM2, %XMM1
*** movsd QWORD PTR [%ESP + 8], %XMM2
    jmp .BBmain_1   # no_exit.i7

This is a bugpoint-reduced testcase, which is why it doesn't make much sense
(e.g. it's an infinite loop). :)

//===---------------------------------------------------------------------===//

SSE should implement 'select_cc' using 'emulated conditional moves' that use
pcmp/pand/pandn/por to do a selection instead of a conditional branch:

double %X(double %Y, double %Z, double %A, double %B) {
    %C = setlt double %A, %B
    %z = add double %Z, 0.0    ;; select operand is not a load
    %D = select bool %C, double %Y, double %z
    ret double %D
}

We currently emit:

_X:
    subl $12, %esp
    xorpd %xmm0, %xmm0
    addsd 24(%esp), %xmm0
    movsd 32(%esp), %xmm1
    movsd 16(%esp), %xmm2
    ucomisd 40(%esp), %xmm1
    jb LBB_X_2
LBB_X_1:
    movsd %xmm0, %xmm2
LBB_X_2:
    movsd %xmm2, (%esp)
    fldl (%esp)
    addl $12, %esp
    ret

//===---------------------------------------------------------------------===//

It's not clear whether we should use pxor or xorps / xorpd to clear XMM
registers.  The choice may depend on subtarget information.  We should do some
more experiments on different x86 machines.

//===---------------------------------------------------------------------===//

Currently the x86 codegen isn't very good at mixing SSE and FPStack
code:

unsigned int foo(double x) { return x; }

foo:
    subl $20, %esp
    movsd 24(%esp), %xmm0
    movsd %xmm0, 8(%esp)
    fldl 8(%esp)
    fisttpll (%esp)
    movl (%esp), %eax
    addl $20, %esp
    ret

This will be solved when we go to a dynamic-programming-based isel.

//===---------------------------------------------------------------------===//

Should generate min/max for stuff like:

void minf(float a, float b, float *X) {
  *X = a <= b ? a : b;
}

Make use of floating point min / max instructions.  Perhaps introduce ISD::FMIN
and ISD::FMAX node types?

//===---------------------------------------------------------------------===//

The first BB of this code:

declare bool %foo()
int %bar() {
    %V = call bool %foo()
    br bool %V, label %T, label %F
T:
    ret int 1
F:
    call bool %foo()
    ret int 12
}

compiles to:

_bar:
    subl $12, %esp
    call L_foo$stub
    xorb $1, %al
    testb %al, %al
    jne LBB_bar_2   # F

It would be better to emit "cmp %al, 1" than the xor and test.

//===---------------------------------------------------------------------===//

Lower memcpy / memset to a series of SSE 128-bit move instructions when it's
feasible.

//===---------------------------------------------------------------------===//

Teach the coalescer to commute 2-addr instructions, allowing us to eliminate
the reg-reg copy in this example:

float foo(int *x, float *y, unsigned c) {
  float res = 0.0;
  unsigned i;
  for (i = 0; i < c; i++) {
    float xx = (float)x[i];
    xx = xx * y[i];
    xx += res;
    res = xx;
  }
  return res;
}

LBB_foo_3:      # no_exit
    cvtsi2ss %XMM0, DWORD PTR [%EDX + 4*%ESI]
    mulss %XMM0, DWORD PTR [%EAX + 4*%ESI]
    addss %XMM0, %XMM1
    inc %ESI
    cmp %ESI, %ECX
****    movaps %XMM1, %XMM0
    jb LBB_foo_3    # no_exit

//===---------------------------------------------------------------------===//

Codegen:
  if (copysign(1.0, x) == copysign(1.0, y))
into:
  if (x^y & mask)
when using SSE.

//===---------------------------------------------------------------------===//

Use movhps to update the upper 64 bits of a v4sf value, and movlps for the
lower half.

//===---------------------------------------------------------------------===//

Better codegen for vector_shuffles like { x, 0, 0, 0 } or { x, 0, x, 0 }.
Perhaps use pxor / xorp* to clear an XMM register first?
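The { x, 0, 0, 0 } case falls straight out of that pattern.  A sketch in
intrinsics (the compiler would emit pxor/xorps for _mm_setzero_ps and a
reg-reg movss for _mm_move_ss):

#include <xmmintrin.h>

/* { v[0], 0, 0, 0 }: clear a register, then move the low element in. */
__m128 low_elt_only(__m128 v) {
  return _mm_move_ss(_mm_setzero_ps(), v);   /* xorps + movss */
}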
//===---------------------------------------------------------------------===//

Better codegen for:

void f(float a, float b, vector float * out) { *out = (vector float){ a, 0.0, 0.0, b}; }
void f(float a, float b, vector float * out) { *out = (vector float){ a, b, 0.0, 0}; }

For the former we generate:

_f:
    pxor %xmm0, %xmm0
    movss 8(%esp), %xmm1
    movaps %xmm0, %xmm2
    unpcklps %xmm1, %xmm2
    movss 4(%esp), %xmm1
    unpcklps %xmm0, %xmm1
    unpcklps %xmm2, %xmm1
    movl 12(%esp), %eax
    movaps %xmm1, (%eax)
    ret

This seems like it should use shufps, one for each of a & b.
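For illustration, a single shufps can combine the two scalar loads into
{ a, 0, 0, b }.  A sketch in intrinsics (using __m128 in place of the gcc
vector float syntax above):

#include <xmmintrin.h>

void f(float a, float b, __m128 *out) {
  __m128 va = _mm_set_ss(a);   /* movss: { a, 0, 0, 0 } */
  __m128 vb = _mm_set_ss(b);   /* movss: { b, 0, 0, 0 } */
  /* lanes {0,1} of va, then lanes {1,0} of vb: { a, 0, 0, b } */
  *out = _mm_shuffle_ps(va, vb, _MM_SHUFFLE(0, 1, 1, 0));
}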
//===---------------------------------------------------------------------===//

How to decide when to use the "floating point version" of logical ops?  Here
are some code fragments:

    movaps LCPI5_5, %xmm2
    divps %xmm1, %xmm2
    mulps %xmm2, %xmm3
    mulps 8656(%ecx), %xmm3
    addps 8672(%ecx), %xmm3
    andps LCPI5_6, %xmm2
    andps LCPI5_1, %xmm3
    por %xmm2, %xmm3
    movdqa %xmm3, (%edi)

    movaps LCPI5_5, %xmm1
    divps %xmm0, %xmm1
    mulps %xmm1, %xmm3
    mulps 8656(%ecx), %xmm3
    addps 8672(%ecx), %xmm3
    andps LCPI5_6, %xmm1
    andps LCPI5_1, %xmm3
    orps %xmm1, %xmm3
    movaps %xmm3, 112(%esp)
    movaps %xmm3, (%ebx)

Due to some minor source change, the latter case ended up using orps and
movaps instead of por and movdqa.  Does it matter?

//===---------------------------------------------------------------------===//

Use movddup to splat a v2f64 directly from a memory source, e.g.:

#include <emmintrin.h>

void test(__m128d *r, double A) {
  *r = _mm_set1_pd(A);
}

llc:

_test:
    movsd 8(%esp), %xmm0
    unpcklpd %xmm0, %xmm0
    movl 4(%esp), %eax
    movapd %xmm0, (%eax)
    ret

icc:

_test:
    movl 4(%esp), %eax
    movddup 8(%esp), %xmm0
    movapd %xmm0, (%eax)
    ret

//===---------------------------------------------------------------------===//

X86RegisterInfo::copyRegToReg() returns X86::MOVAPSrr for VR128.  Is it
possible to choose between movaps, movapd, and movdqa based on the types of
the source and destination?

How about andps, andpd, and pand?  Do we really care about the type of the
packed elements?  If not, why not always use the "ps" variants, which are
likely to be shorter?
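The results are bitwise identical either way; whether it matters is purely a
question of encoding length and, on some implementations, execution domain.
As a sanity check, both sketches below compute fabs on four floats and must
produce the same bits; only the opcode (andps vs. pand) differs.  (The
_mm_cast* intrinsics are bit-preserving casts and generate no instructions.)

#include <emmintrin.h>

/* fabs via the "ps" logical op: andps with a sign-bit-clearing mask. */
__m128 fabs_andps(__m128 x) {
  return _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)));
}

/* The same computation via the integer op: pand on the same bits. */
__m128 fabs_pand(__m128 x) {
  __m128i bits = _mm_castps_si128(x);
  return _mm_castsi128_ps(_mm_and_si128(bits, _mm_set1_epi32(0x7fffffff)));
}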
//===---------------------------------------------------------------------===//

We are emitting bad code for this:

float %test(float* %V, int %I, int %D, float %V) {
entry:
    %tmp = seteq int %D, 0
    br bool %tmp, label %cond_true, label %cond_false23

cond_true:
    %tmp3 = getelementptr float* %V, int %I
    %tmp = load float* %tmp3
    %tmp5 = setgt float %tmp, %V
    %tmp6 = tail call bool %llvm.isunordered.f32( float %tmp, float %V )
    %tmp7 = or bool %tmp5, %tmp6
    br bool %tmp7, label %UnifiedReturnBlock, label %cond_next

cond_next:
    %tmp10 = add int %I, 1
    %tmp12 = getelementptr float* %V, int %tmp10
    %tmp13 = load float* %tmp12
    %tmp15 = setle float %tmp13, %V
    %tmp16 = tail call bool %llvm.isunordered.f32( float %tmp13, float %V )
    %tmp17 = or bool %tmp15, %tmp16
    %retval = select bool %tmp17, float 0.000000e+00, float 1.000000e+00
    ret float %retval

cond_false23:
    %tmp28 = tail call float %foo( float* %V, int %I, int %D, float %V )
    ret float %tmp28

UnifiedReturnBlock:     ; preds = %cond_true
    ret float 0.000000e+00
}

declare bool %llvm.isunordered.f32(float, float)

declare float %foo(float*, int, int, float)

It exposes a known load folding problem:

    movss (%edx,%ecx,4), %xmm1
    ucomiss %xmm1, %xmm0

As well as this:

LBB_test_2: # cond_next
    movss LCPI1_0, %xmm2
    pxor %xmm3, %xmm3
    ucomiss %xmm0, %xmm1
    jbe LBB_test_6  # cond_next
LBB_test_5: # cond_next
    movaps %xmm2, %xmm3
LBB_test_6: # cond_next
    movss %xmm3, 40(%esp)
    flds 40(%esp)
    addl $44, %esp
    ret

Clearly it's unnecessary to clear %xmm3.  It's also not clear why we are
emitting three moves (movss, movaps, movss).

//===---------------------------------------------------------------------===//

External test Nurbs exposed some problems.  Look for
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140.  This is what icc
emits:

    movaps (%edx), %xmm2            #59.21
    movaps (%edx), %xmm5            #60.21
    movaps (%edx), %xmm4            #61.21
    movaps (%edx), %xmm3            #62.21
    movl 40(%ecx), %ebp             #69.49
    shufps $0, %xmm2, %xmm5         #60.21
    movl 100(%esp), %ebx            #69.20
    movl (%ebx), %edi               #69.20
    imull %ebp, %edi                #69.49
    addl (%eax), %edi               #70.33
    shufps $85, %xmm2, %xmm4        #61.21
    shufps $170, %xmm2, %xmm3       #62.21
    shufps $255, %xmm2, %xmm2       #63.21
    lea (%ebp,%ebp,2), %ebx         #69.49
    negl %ebx                       #69.49
    lea -3(%edi,%ebx), %ebx         #70.33
    shll $4, %ebx                   #68.37
    addl 32(%ecx), %ebx             #68.37
    testb $15, %bl                  #91.13
    jne L_B1.24     # Prob 5%       #91.13

This is the llvm code after instruction scheduling:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
    %reg1078 = MOV32ri -3
    %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
    %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
    %reg1080 = IMUL32rr %reg1079, %reg1037
    %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
    %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
    %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
    %reg1082 = SHL32ri %reg1038, 4
    %reg1039 = ADD32rr %reg1036, %reg1082
    %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
    %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
    %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
    %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
    %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
    %reg1040 = MOV32rr %reg1039
    %reg1084 = AND32ri8 %reg1039, 15
    CMP32ri8 %reg1084, 0
    JE mbb<cond_next204,0xa914d30>

Still ok.  After register allocation:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
    %EAX = MOV32ri -3
    %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
    ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
    %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
    %EDX = MOV32rm %EDX, 1, %NOREG, 40
    IMUL32rr %EAX<def&use>, %EDX
    %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
    %ESI = MOV32rm %ESI, 1, %NOREG, 0
    MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
    %EAX = LEA32r %ESI, 1, %EAX, -3
    %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
    %ESI = MOV32rm %ESI, 1, %NOREG, 32
    %EDI = MOV32rr %EAX
    SHL32ri %EDI<def&use>, 4
    ADD32rr %EDI<def&use>, %ESI
    %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
    %XMM1 = MOVAPSrr %XMM0
    SHUFPSrr %XMM1<def&use>, %XMM1, 170
    %XMM2 = MOVAPSrr %XMM0
    SHUFPSrr %XMM2<def&use>, %XMM2, 0
    %XMM3 = MOVAPSrr %XMM0
    SHUFPSrr %XMM3<def&use>, %XMM3, 255
    SHUFPSrr %XMM0<def&use>, %XMM0, 85
    %EBX = MOV32rr %EDI
    AND32ri8 %EBX<def&use>, 15
    CMP32ri8 %EBX, 0
    JE mbb<cond_next204,0xa914d30>

This looks really bad.  The problem is that shufps is a destructive opcode:
since the same value appears as operand two of more than one shufps, we end
up with a number of copies.  Note that icc suffers from the same problem.
Either the instruction selector should select pshufd, or the register
allocator could perform the two-address to three-address transformation.

It also exposes some other problems.  See MOV32ri -3 and the spills.

//===---------------------------------------------------------------------===//

http://gcc.gnu.org/bugzilla/show_bug.cgi?id=25500

LLVM is producing bad code:

LBB_main_4: # cond_true44
    addps %xmm1, %xmm2
    subps %xmm3, %xmm2
    movaps (%ecx), %xmm4
    movaps %xmm2, %xmm1
    addps %xmm4, %xmm1
    addl $16, %ecx
    incl %edx
    cmpl $262144, %edx
    movaps %xmm3, %xmm2
    movaps %xmm4, %xmm3
    jne LBB_main_4  # cond_true44

There are two problems: 1) there is no need for two loop induction variables;
we can compare against 262144 * 16.  2) a known register coalescer issue: we
should be able to eliminate one of the movaps:

    addps %xmm2, %xmm1    <=== Commute!
    subps %xmm3, %xmm1
    movaps (%ecx), %xmm4
    movaps %xmm1, %xmm1   <=== Eliminate!
    addps %xmm4, %xmm1
    addl $16, %ecx
    incl %edx
    cmpl $262144, %edx
    movaps %xmm3, %xmm2
    movaps %xmm4, %xmm3
    jne LBB_main_4  # cond_true44

//===---------------------------------------------------------------------===//

Consider:

__m128 test(float a) {
  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
}

This compiles into:

    movss 4(%esp), %xmm1
    mulss %xmm1, %xmm1
    xorps %xmm0, %xmm0
    movss %xmm1, %xmm0
    ret

Because mulss doesn't modify the top 3 elements, the top elements of
xmm1 are already zero'd.  We could compile this to:

    movss 4(%esp), %xmm0
    mulss %xmm0, %xmm0
    ret
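In intrinsics form, the shape that maps directly onto that two-instruction
sequence is a sketch like this; it is equivalent to the _mm_set_ps version
precisely because movss from memory zeros the upper lanes and mulss leaves
them alone:

#include <xmmintrin.h>

__m128 test2(float a) {
  __m128 x = _mm_set_ss(a);   /* movss: { a, 0, 0, 0 }   */
  return _mm_mul_ss(x, x);    /* mulss: { a*a, 0, 0, 0 } */
}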
//===---------------------------------------------------------------------===//

Here's a sick and twisted idea.  Consider code like this:

__m128 test(__m128 a) {
  float b = *(float*)&a;
  ...
  return _mm_set_ps(0.0, 0.0, 0.0, b);
}

This might compile to this code:

    movaps c(%esp), %xmm1
    xorps %xmm0, %xmm0
    movss %xmm1, %xmm0
    ret

Now consider if the ... code caused xmm1 to get spilled.  This might produce
this code:

    movaps c(%esp), %xmm1
    movaps %xmm1, c2(%esp)
    ...

    xorps %xmm0, %xmm0
    movaps c2(%esp), %xmm1
    movss %xmm1, %xmm0
    ret

However, since the reload is only used by these instructions, we could
"fold" it into the uses, producing something like this:

    movaps c(%esp), %xmm1
    movaps %xmm1, c2(%esp)
    ...

    movss c2(%esp), %xmm0
    ret

... saving two instructions.

The basic idea is that a reload from a spill slot can, if only one 4-byte
chunk is used, bring in three zeros and the one element instead of all four
elements.  This can be used to simplify a variety of shuffle operations,
where the elements are fixed zeros.

//===---------------------------------------------------------------------===//

For this:

#include <emmintrin.h>
void test(__m128d *r, __m128d *A, double B) {
  *r = _mm_loadl_pd(*A, &B);
}

we generate:

    subl $12, %esp
    movsd 24(%esp), %xmm0
    movsd %xmm0, (%esp)
    movl 20(%esp), %eax
    movapd (%eax), %xmm0
    movlpd (%esp), %xmm0
    movl 16(%esp), %eax
    movapd %xmm0, (%eax)
    addl $12, %esp
    ret

icc generates:

    movl 4(%esp), %edx          #3.6
    movl 8(%esp), %eax          #3.6
    movapd (%eax), %xmm0        #4.22
    movlpd 12(%esp), %xmm0      #4.8
    movapd %xmm0, (%edx)        #4.3
    ret                         #5.1

So icc is smart enough to know that B is in memory, so it doesn't load it and
store it back to the stack.

//===---------------------------------------------------------------------===//

__m128d test1(__m128d A, __m128d B) {
  return _mm_shuffle_pd(A, B, 0x3);
}

compiles to:

    shufpd $3, %xmm1, %xmm0

Perhaps it's better to use unpckhpd instead?

    unpckhpd %xmm1, %xmm0

We don't know if unpckhpd is faster, but it is shorter.

//===---------------------------------------------------------------------===//

This generates ugly code, probably due to costs being off or something:

void %test(float* %P, <4 x float>* %P2 ) {
    %xFloat0.688 = load float* %P
    %loadVector37.712 = load <4 x float>* %P2
    %inFloat3.713 = insertelement <4 x float> %loadVector37.712, float 0.000000e+00, uint 3
    store <4 x float> %inFloat3.713, <4 x float>* %P2
    ret void
}

Generates:

_test:
    pxor %xmm0, %xmm0
    movd %xmm0, %eax        ;; EAX = 0!
    movl 8(%esp), %ecx
    movaps (%ecx), %xmm0
    pinsrw $6, %eax, %xmm0
    shrl $16, %eax          ;; EAX = 0 again!
    pinsrw $7, %eax, %xmm0
    movaps %xmm0, (%ecx)
    ret

It would be better to generate:

_test:
    movl 8(%esp), %ecx
    movaps (%ecx), %xmm0
    xor %eax, %eax
    pinsrw $6, %eax, %xmm0
    pinsrw $7, %eax, %xmm0
    movaps %xmm0, (%ecx)
    ret

or use pxor (to make a zero vector) and shuffle (to insert it).

//===---------------------------------------------------------------------===//

Some useful information in the Apple Altivec / SSE Migration Guide:

http://developer.apple.com/documentation/Performance/Conceptual/
Accelerate_sse_migration/index.html

e.g. SSE select using and, andnot, or.  Various SSE compare translations.
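For reference, the select idiom from that guide, sketched in intrinsics:
build an all-ones / all-zeros mask with a compare, then blend with
and / andnot / or (this is essentially the emulated conditional move
mentioned earlier in this file):

#include <xmmintrin.h>

/* result[i] = a[i] < b[i] ? t[i] : f[i] */
__m128 select_lt(__m128 a, __m128 b, __m128 t, __m128 f) {
  __m128 mask = _mm_cmplt_ps(a, b);          /* cmpltps: all-ones where true */
  return _mm_or_ps(_mm_and_ps(mask, t),      /* keep t where mask is set     */
                   _mm_andnot_ps(mask, f));  /* keep f where mask is clear   */
}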