Computer Chess Club Archives

Search

Terms
Messages

Subject: Re: is this really faster?

Author: Filip Tvrzsky
Date: 16:31:15 04/21/03
On April 19, 2003 at 12:34:22, Dieter Buerssner wrote:

>On April 19, 2003 at 05:29:13, Gerd Isenberg wrote:
>
>>Yes Dieter, i tried a lot.
>
>Gerd, I have no doubt about this. I only questioned the 2 specific routines (the
>loops on the 2 32 bit parts, once in assembler, once in C). I questioned it
>only, because I looked at the generated assembler of the C-Code, which was
>practically identical, to the assembly you showed.
>
>Anyway, I made a small test program. dieter() is the C-Code I had shown, gerd()
>your assembly (I wrote the same in GCC-style, too), and crafty() the routine Uri
>showed in this thread. In my test, gerd() was not faster than dieter() (dieter()
>was actually slightly faster than gerd() using MSVC, and the same using gcc). As
>should be expected, with this specific test, crafty() was a bit slower (much
>more with gcc than with MSVC). I used -O2 for both compilers. No doubt, one can
>argue about my testing procedure. I actually made it out of fun only, not to do
>some scientific experiments. I also used a union for bitboard, and changed for
>this the functions in the thread slightly. I just did not want to try which
>tricks would work best to convince the compiler to extract the 2 32-bit parts
>efficiently.
>
>Regards,
>Dieter
>
>#ifdef __GNUC__
>#define MYINLINE static __inline__
>typedef unsigned long long BITBOARD;
>#else
>#define MYINLINE static __forceinline
>typedef unsigned __int64 BITBOARD;
>#endif
>
>/* Note, this is not strictly portable, even not the pure C code*/
>typedef union
>{
>  BITBOARD b;
>  unsigned long w[2];
>} BU;
>
>
>MYINLINE int crafty(register BU a) {
>  register int c=0;
>
>  while(a.b) {
>    c++;
>    a.b &= a.b - 1;
>  }
>  return(c);
>}
>
>MYINLINE int dieter(BU a)
>{
>  unsigned long w;
>  int n = 0;
>  w = a.w[0];
>  if (w)
>    do
>    {
>      n++;
>    }
>    while ((w &= w-1) != 0);
>  w = a.w[1];
>  if (w)
>    do
>    {
>      n++;
>    }
>    while ((w &= w-1) != 0);
>  return n;
>}
>
>#ifdef __GNUC__
>
>MYINLINE int gerd(BU a)
>{
>  int tmp, tmp2, n;
>  __asm__ __volatile__(
>    "   movl %3, %1
>        xorl %0, %0
>        testl %1, %1
>        je 1f
>     0: incl %0
>        leal -1(%1), %2
>        andl %2, %1
>        jne 0b
>     1: movl %4, %1
>        testl %1, %1
>        je 3f
>     2: incl %0
>        leal -1(%1), %2
>        andl %2, %1
>        jne 2b
>     3:"
>    : "=r&" (n), "=r&" (tmp), "=r&" (tmp2)
>    : "g" (a.w[0]), "g" (a.w[1])
>    : "cc" /* Flags "condition code" changed */);
>  return n;
>}
>
>#else
>
>MYINLINE int gerd (BU bb)
>{
>	__asm
>	{
>		mov     ecx, dword ptr bb
>		xor     eax, eax
>		test    ecx, ecx
>		jz      l1
>	    l0: lea     edx, [ecx-1]
>		inc     eax
>		and     ecx, edx
>		jnz     l0
>	    l1: mov     ecx, dword ptr bb+4
>		test    ecx, ecx
>		jz      l3
>	    l2: lea     edx, [ecx-1]
>		inc     eax
>		and     ecx, edx
>		jnz     l2
>	    l3:
>	}
>}
>#endif
>
>#include <stdlib.h>
>#include <stdio.h>
>#include <time.h>
>
>#define NLOOPS 10000000L
>#define NBB 100 /* Number of different bitboards */
>#define NBITS 4 /* That many bits set randomly, can be less */
>
>#define PRNG rand
>#define MY_RAND_MAX RAND_MAX
>unsigned long rand_range(unsigned long range)
>{
>  unsigned long rmax, r, d;
>  /* find the largest number rmax <= MY_RAND_MAX, for which
>     (rmax+1) % range == 0.
>     All returns from rand() > rmax will be skipped, to guarantee
>     equal probability for all return values. */
>  d = (MY_RAND_MAX+1U-range) / range + 1;
>  rmax = d * range - 1; /* -1 to avoid "overflow to zero" */
>  do
>    r = PRNG();
>  while (r > rmax);
>  return r/d;
>}
>
>BU bbs[NBB];
>
>/* Define functions in "wrong" order, to not let the compiler
>   do stupid inlining, that can make the code slower */
>unsigned long foo(void);
>unsigned long bar(void);
>
>int main(void)
>{
>  int i, j;
>  clock_t c;
>  /* Set some random bitboards */
>  for (i=0; i<NBB; i++)
>    for (j=0; j<NBITS; j++)
>      bbs[i].b |= (BITBOARD)1 << rand_range(64);
>  c = clock();
>  /* Print number of counted bits, so we get an indicication,
>     when something  goes wrong. My easily overflow. */
>  printf("res %lu\n", bar());
>  c = clock()-c;
>  printf("used %.3f s\n", (double)c/CLOCKS_PER_SEC);
>  return 0;
>}
>
>unsigned long bar(void)
>{
>  long i;
>  unsigned long n=0;
>  for (i=0; i<NLOOPS; i++)
>    n += foo();
>  return n;
>}
>
>unsigned long foo(void)
>{
>   int i;
>   unsigned long n=0;
>   for (i=0; i<NBB; i++)
>     n += crafty(bbs[i]);
>   return n;
>}

It is very strange, but my test results are rather different!
Compiler is gcc 3.2, CPU is Duron 600@900.

1) With only -O3 optimization:
   res 3880000000
   gerd()          32,520 s
   crafty()        47,950 s
   dieter()        47,010 s

2) With following optimization settings set (-O3 and -fsave-memoized
-fno-exceptions -fmerge-all-constants -save-temps -march=athlon -mcpu=athlon
-mmmx -funroll-loops -fomit-frame-pointer) which is my favorite now:
   res 3880000000
   gerd()          18,350 s
   crafty()        37,290 s
   dieter()        41,750 s

Maybe, I should little bit investigate compiler assembler output ...
Filip
Re: is this really faster? Filip Tvrzsky 16:58:42 04/21/03
- Re: is this really faster? Dieter Buerssner 13:00:05 04/22/03
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.