Author: Dieter Buerssner
Date: 12:23:40 12/18/03
Go up one level in this thread
On December 18, 2003 at 02:35:02, Tony Werten wrote:

>In this case it will work but I don't think the compiler will do that without
>RESTRICT. See example below.

I see no reason why it shouldn't. The other optimization I mentioned would not be possible (and not be done, of course).

>Problems could arise if c points to somewhere in the a or b array.

I don't think so. The method of my manual unrolling will work in any case where the original code worked. If c aliases a or b, the original code probably did not work as intended, but the unrolled code will give the same result.

A test case: foo is there just to give the compiler a chance to inline, and to see that everything gets unrolled (the same happens if it is called with len=10).

C:\src>gcc -O3 -S -funroll-loops d.c
C:\src>cat d.c
void vectoradd(double *a, double *b, double *c, unsigned len)
{
  unsigned i;
  for(i = 0; i < len; i++)
    c[i+1] = b[i] + a[i];
}

void foo(double *a, double *b, double *c)
{
  vectoradd(a,b,c,4);
}
C:\src>gcc -O3 -S -funroll-loops d.c
C:\src>cat d.s
        .file "d.c"
        .section .text
        .p2align 1
        .p2align 4,,15
.globl _vectoradd
_vectoradd:
        pushl %ebp
        xorl %edx, %edx
        movl %esp, %ebp
        pushl %edi
        pushl %esi
        movl 20(%ebp), %edi
        pushl %ebx
        movl 8(%ebp), %esi
        movl 16(%ebp), %ecx
        movl 12(%ebp), %ebx
        cmpl %edi, %edx
        jae L8
        movl %edi, %eax
        andl $3, %eax
        cmpl $1, %edi
        ja L28
L11:
        fldl (%esi,%edx,8)
        faddl (%ebx,%edx,8)
        fstpl 8(%ecx,%edx,8)
        incl %edx
        cmpl %edi, %edx
        jae L8
; loop unrolled by four
        .p2align 4,,7
L6:
        fldl (%esi,%edx,8)
        leal 1(%edx), %eax
        faddl (%ebx,%edx,8)
        fstpl 8(%ecx,%edx,8)
        fldl (%esi,%eax,8)
        faddl (%ebx,%eax,8)
        fstpl 8(%ecx,%eax,8)
        leal 2(%edx), %eax
        fldl (%esi,%eax,8)
        faddl (%ebx,%eax,8)
        fstpl 8(%ecx,%eax,8)
        leal 3(%edx), %eax
        addl $4, %edx
        cmpl %edi, %edx
        fldl (%esi,%eax,8)
        faddl (%ebx,%eax,8)
        fstpl 8(%ecx,%eax,8)
        jb L6
L8:
        popl %ebx
        popl %esi
        popl %edi
        popl %ebp
        ret
L28:
        testl %eax, %eax
        je L6
        cmpl $1, %eax
        jle L11
        cmpl $2, %eax
        jle L12
        fldl (%esi)
        movl $1, %edx
        faddl (%ebx)
        fstpl 8(%ecx)
L12:
        fldl (%esi,%edx,8)
        faddl (%ebx,%edx,8)
        fstpl 8(%ecx,%edx,8)
        incl %edx
        jmp L11
        .p2align 1
        .p2align 4,,15
.globl _foo
_foo:
        pushl %ebp
        movl %esp, %ebp
        movl 8(%ebp), %eax
        movl 12(%ebp), %edx
        movl 16(%ebp), %ecx
; everything unrolled
        fldl (%eax)
        faddl (%edx)
        fstpl 8(%ecx)
        fldl 8(%eax)
        faddl 8(%edx)
        fstpl 16(%ecx)
        fldl 16(%eax)
        faddl 16(%edx)
        fstpl 24(%ecx)
        fldl 24(%eax)
        faddl 24(%edx)
        fstpl 32(%ecx)
        popl %ebp
        ret
        .ident "GCC: (GNU) 3.2"

Regards,
Dieter
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.