Computer Chess Club Archives


Search

Terms

Messages

Subject: Re: is this really faster?

Author: Dieter Buerssner

Date: 09:34:22 04/19/03

Go up one level in this thread


On April 19, 2003 at 05:29:13, Gerd Isenberg wrote:

>Yes Dieter, i tried a lot.

Gerd, I have no doubt about this. I only questioned the 2 specific routines (the
loops on the 2 32 bit parts, once in assembler, once in C). I questioned it
only, because I looked at the generated assembler of the C-Code, which was
practically identical, to the assembly you showed.

Anyway, I made a small test program. dieter() is the C-Code I had shown, gerd()
your assembly (I wrote the same in GCC-style, too), and crafty() the routine Uri
showed in this thread. In my test, gerd() was not faster than dieter() (dieter()
was actually slightly faster than gerd() using MSVC, and the same using gcc). As
should be expected, with this specific test, crafty() was a bit slower (much
more with gcc than with MSVC). I used -O2 for both compilers. No doubt, one can
argue about my testing procedure. I actually made it out of fun only, not to do
some scientific experiments. I also used a union for bitboard, and changed for
this the functions in the thread slightly. I just did not want to try which
tricks would work best to convince the compiler to extract the 2 32-bit parts
efficiently.

Regards,
Dieter

#ifdef __GNUC__
#define MYINLINE static __inline__
typedef unsigned long long BITBOARD;
#else
#define MYINLINE static __forceinline
typedef unsigned __int64 BITBOARD;
#endif

/* Note, this is not strictly portable, even not the pure C code*/
typedef union
{
  BITBOARD b;
  unsigned long w[2];
} BU;


MYINLINE int crafty(register BU a) {
  register int c=0;

  while(a.b) {
    c++;
    a.b &= a.b - 1;
  }
  return(c);
}

MYINLINE int dieter(BU a)
{
  unsigned long w;
  int n = 0;
  w = a.w[0];
  if (w)
    do
    {
      n++;
    }
    while ((w &= w-1) != 0);
  w = a.w[1];
  if (w)
    do
    {
      n++;
    }
    while ((w &= w-1) != 0);
  return n;
}

#ifdef __GNUC__

MYINLINE int gerd(BU a)
{
  int tmp, tmp2, n;
  __asm__ __volatile__(
    "   movl %3, %1
        xorl %0, %0
        testl %1, %1
        je 1f
     0: incl %0
        leal -1(%1), %2
        andl %2, %1
        jne 0b
     1: movl %4, %1
        testl %1, %1
        je 3f
     2: incl %0
        leal -1(%1), %2
        andl %2, %1
        jne 2b
     3:"
    : "=r&" (n), "=r&" (tmp), "=r&" (tmp2)
    : "g" (a.w[0]), "g" (a.w[1])
    : "cc" /* Flags "condition code" changed */);
  return n;
}

#else

MYINLINE int gerd (BU bb)
{
	__asm
	{
		mov     ecx, dword ptr bb
		xor     eax, eax
		test    ecx, ecx
		jz      l1
	    l0: lea     edx, [ecx-1]
		inc     eax
		and     ecx, edx
		jnz     l0
	    l1: mov     ecx, dword ptr bb+4
		test    ecx, ecx
		jz      l3
	    l2: lea     edx, [ecx-1]
		inc     eax
		and     ecx, edx
		jnz     l2
	    l3:
	}
}
#endif

#include <stdlib.h>
#include <stdio.h>
#include <time.h>

#define NLOOPS 10000000L
#define NBB 100 /* Number of different bitboards */
#define NBITS 4 /* That many bits set randomly, can be less */

#define PRNG rand
#define MY_RAND_MAX RAND_MAX
unsigned long rand_range(unsigned long range)
{
  unsigned long rmax, r, d;
  /* find the largest number rmax <= MY_RAND_MAX, for which
     (rmax+1) % range == 0.
     All returns from rand() > rmax will be skipped, to guarantee
     equal probability for all return values. */
  d = (MY_RAND_MAX+1U-range) / range + 1;
  rmax = d * range - 1; /* -1 to avoid "overflow to zero" */
  do
    r = PRNG();
  while (r > rmax);
  return r/d;
}

BU bbs[NBB];

/* Define functions in "wrong" order, to not let the compiler
   do stupid inlining, that can make the code slower */
unsigned long foo(void);
unsigned long bar(void);

int main(void)
{
  int i, j;
  clock_t c;
  /* Set some random bitboards */
  for (i=0; i<NBB; i++)
    for (j=0; j<NBITS; j++)
      bbs[i].b |= (BITBOARD)1 << rand_range(64);
  c = clock();
  /* Print number of counted bits, so we get an indicication,
     when something  goes wrong. My easily overflow. */
  printf("res %lu\n", bar());
  c = clock()-c;
  printf("used %.3f s\n", (double)c/CLOCKS_PER_SEC);
  return 0;
}

unsigned long bar(void)
{
  long i;
  unsigned long n=0;
  for (i=0; i<NLOOPS; i++)
    n += foo();
  return n;
}

unsigned long foo(void)
{
   int i;
   unsigned long n=0;
   for (i=0; i<NBB; i++)
     n += crafty(bbs[i]);
   return n;
}



This page took 0 seconds to execute

Last modified: Thu, 15 Apr 21 08:11:13 -0700

Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.