Author: Dieter Buerssner
Date: 12:23:40 12/18/03
Go up one level in this thread
On December 18, 2003 at 02:35:02, Tony Werten wrote:
>In this case it will work but I don't think the compiler will do that without
>RESTRICT.
See the example below. I see no reason why it shouldn't. The other optimization I
mentioned would not be possible (and would not be done, of course).
>Problems could arise if c points to somewhere in the a or b array.
I don't think so. My method of manual unrolling will work in every case where the
original code worked. If c aliases a or b, the original code probably did not
work as intended, but the unrolled code will give the same result. Here is a test case;
foo is there just to give the compiler a chance to inline, and to show that everything
gets unrolled (the same would happen if it were called with len=10).
C:\src>gcc -O3 -S -funroll-loops d.c
C:\src>cat d.c
/*
 * Adds a[i] and b[i] for every i in [0, len) and stores the sum in c[i+1].
 *
 * c must therefore have room for len + 1 doubles; c[0] is never written.
 * The pointers carry no restrict qualifier, so a, b and c may overlap. The
 * i+1 offset on the store means that, when c overlaps a or b, an element
 * written in one iteration can be read back in the next, so the result
 * depends on loads and stores happening in source order.
 */
void vectoradd(double *a, double *b, double *c, unsigned len)
{
unsigned i;
for(i = 0; i < len; i++)
c[i+1] = b[i] + a[i]; /* store is one element ahead of the loads */
}
/*
 * Calls vectoradd with the compile-time constant length 4 (writes c[1]..c[4]).
 * Serves as a call site with a known trip count, so the compiler has the
 * chance to inline the call and unroll the loop completely.
 */
void foo(double *a, double *b, double *c)
{
vectoradd(a,b,c,4);
}
C:\src>gcc -O3 -S -funroll-loops d.c
C:\src>cat d.s
.file "d.c"
.section .text
#-----------------------------------------------------------------------
# void vectoradd(double *a, double *b, double *c, unsigned len)
#
#   c[i+1] = b[i] + a[i]   for i in [0, len)
#
# 32-bit x86, AT&T syntax; all four arguments are read from the stack.
# Register roles after the prologue:
#   %esi = a      %ebx = b      %ecx = c      %edi = len
#   %edx = i      %eax = len & 3 during dispatch, scratch index inside L6
# Shape: 0..3 peeled iterations (len & 3), then a loop that handles four
# elements per pass. Every element is loaded, added and stored before the
# next one is loaded, i.e. memory is accessed in the same order as in the
# rolled loop.
# Annotations use '#': on x86 GAS treats ';' as a statement separator, so
# a line such as "; loop unrolled by four" would be parsed as an instruction.
#-----------------------------------------------------------------------
	.p2align 1
	.p2align 4,,15
.globl _vectoradd
_vectoradd:
	pushl	%ebp
	xorl	%edx, %edx		# i = 0
	movl	%esp, %ebp
	pushl	%edi			# %edi/%esi/%ebx are restored at L8
	pushl	%esi
	movl	20(%ebp), %edi		# edi = len
	pushl	%ebx
	movl	8(%ebp), %esi		# esi = a
	movl	16(%ebp), %ecx		# ecx = c
	movl	12(%ebp), %ebx		# ebx = b
	cmpl	%edi, %edx
	jae	L8			# len == 0: nothing to do
	movl	%edi, %eax
	andl	$3, %eax		# eax = len % 4 = number of iterations to peel
	cmpl	$1, %edi
	ja	L28			# len > 1: dispatch on the remainder
					# fall through only when len == 1
L11:
	# Last peeled iteration: c[i+1] = a[i] + b[i]
	fldl	(%esi,%edx,8)
	faddl	(%ebx,%edx,8)
	fstpl	8(%ecx,%edx,8)		# displacement 8 is the "+1" in c[i+1]
	incl	%edx
	cmpl	%edi, %edx
	jae	L8			# i >= len: done, no full group of four left
# loop unrolled by four
# On entry len - i is a non-zero multiple of 4.
	.p2align 4,,7
L6:
	fldl	(%esi,%edx,8)		# element i
	leal	1(%edx), %eax		# eax = i + 1
	faddl	(%ebx,%edx,8)
	fstpl	8(%ecx,%edx,8)
	fldl	(%esi,%eax,8)		# element i + 1
	faddl	(%ebx,%eax,8)
	fstpl	8(%ecx,%eax,8)
	leal	2(%edx), %eax		# eax = i + 2
	fldl	(%esi,%eax,8)		# element i + 2
	faddl	(%ebx,%eax,8)
	fstpl	8(%ecx,%eax,8)
	leal	3(%edx), %eax		# eax = i + 3 (kept, because i is advanced next)
	addl	$4, %edx		# i += 4
	cmpl	%edi, %edx		# flags for the jb below; the x87 ops in between leave EFLAGS alone
	fldl	(%esi,%eax,8)		# element i + 3 (old i)
	faddl	(%ebx,%eax,8)
	fstpl	8(%ecx,%eax,8)
	jb	L6			# i < len: another group of four
L8:
	# Common exit: pop in reverse order of the pushes above.
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret
L28:
	# Remainder dispatch, eax = len % 4 (0..3), i = 0:
	#   0 -> straight into the unrolled loop
	#   1 -> L11            (one peeled iteration)
	#   2 -> L12, then L11  (two peeled iterations)
	#   3 -> fall through, then L12, then L11 (three peeled iterations)
	testl	%eax, %eax
	je	L6
	cmpl	$1, %eax
	jle	L11
	cmpl	$2, %eax
	jle	L12
	fldl	(%esi)			# element 0: c[1] = a[0] + b[0]
	movl	$1, %edx		# i = 1
	faddl	(%ebx)
	fstpl	8(%ecx)
L12:
	fldl	(%esi,%edx,8)		# second-to-last peeled iteration
	faddl	(%ebx,%edx,8)
	fstpl	8(%ecx,%edx,8)
	incl	%edx
	jmp	L11
#-----------------------------------------------------------------------
# void foo(double *a, double *b, double *c)
#
# No call is emitted here: the body is four straight-line copies of
#   c[i+1] = a[i] + b[i]   for i = 0..3
# 32-bit x86, AT&T syntax; the three pointers are read from the stack.
#   %eax = a      %edx = b      %ecx = c
# Each sum is stored before the next element is loaded.
# Annotations use '#': on x86 GAS treats ';' as a statement separator, so
# a line such as "; everything unrolled" would be parsed as an instruction.
#-----------------------------------------------------------------------
	.p2align 1
	.p2align 4,,15
.globl _foo
_foo:
	pushl	%ebp
	movl	%esp, %ebp
	movl	8(%ebp), %eax		# eax = a
	movl	12(%ebp), %edx		# edx = b
	movl	16(%ebp), %ecx		# ecx = c
# everything unrolled
	fldl	(%eax)			# c[1] = a[0] + b[0]
	faddl	(%edx)
	fstpl	8(%ecx)
	fldl	8(%eax)			# c[2] = a[1] + b[1]
	faddl	8(%edx)
	fstpl	16(%ecx)
	fldl	16(%eax)		# c[3] = a[2] + b[2]
	faddl	16(%edx)
	fstpl	24(%ecx)
	fldl	24(%eax)		# c[4] = a[3] + b[3]
	faddl	24(%edx)
	fstpl	32(%ecx)
	popl	%ebp
	ret
.ident "GCC: (GNU) 3.2"
Regards,
Dieter
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.