Author: Dieter Buerssner
Date: 13:06:30 05/17/05
Steven, this looks like you could still get a few percent without much effort.
It seems you didn't inline NextSq(). Perhaps the compiler does it
automatically if it can see the definition of the function when it is called
(or perhaps newer Gcc versions support inter-module optimization?). It is also very
possible that, the way you wrote NextSq(), it is a bit too complicated to gain much
from inlining. Doing both bitscan opcodes directly inside one routine should
make it efficient enough. The routine I show for "Last_one" (there seems to be some
confusion about what is last/first - at least for me) does this. Unfortunately,
it depends on some undocumented behaviour, but so far it has worked on all known x86
processors. Otherwise, one would have to add one branch (a sketch of that
branching variant follows after the two routines below).
BITSCAN_INLINE unsigned int asmdirty_LastOne(BITBOARD a)
{
unsigned int res, tmp;
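/* Relies on bsr leaving its destination register unchanged when the source
   operand is zero (officially undefined, but observed on all x86 processors
   so far) - this is the undocumented behaviour mentioned above. */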
__asm__ __volatile__
("xorl %1, %1 \n"
"bsrl %2, %0 \n"
"bsrl %3, %0 \n"
"setnz %b1 \n"
"shll $5, %1 \n"
"addl %1, %0\n"
#if __ICC /* The ICC versions I tried did not support the "&". They rather
             seem to assume that it is always there, or perhaps they can
             deduce it from analysing the source code. */
: "=r" (res), "=q" (tmp)
#else
: "=r&" (res), "=q&" (tmp)
#endif
: "rm"((unsigned int)a), "rm" ((unsigned int)(a>>32))
: "cc");
return res;
}
BITSCAN_INLINE unsigned int asmdirty_FirstOne(BITBOARD a)
{
unsigned int res;
__asm__ __volatile__
("bsfl %2, %0 \n"
"addl $32, %0 \n"
"bsfl %1, %0 \n"
#if __ICC
: "=r" (res)
#else
: "=r&" (res)
#endif
: "rm"((unsigned int)a), "rm" ((unsigned int)(a>>32))
: "cc");
return res;
}
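For comparison, the branching variant mentioned above might look roughly like
this (my own sketch - the name branchy_LastOne is made up, and it is not tested
as carefully as the routines above):
BITSCAN_INLINE unsigned int branchy_LastOne(BITBOARD a)
{
    unsigned int hi = (unsigned int)(a >> 32), res;
    if (hi)
    {
        /* high word non-empty: scan it and add 32 */
        __asm__ ("bsrl %1, %0" : "=r" (res) : "rm" (hi) : "cc");
        return res + 32;
    }
    /* otherwise the bit must be in the low word (argument assumed non-zero) */
    __asm__ ("bsrl %1, %0" : "=r" (res) : "rm" ((unsigned int)a) : "cc");
    return res;
}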
One often also has to clear the bit that was found, which might be done a little
more efficiently inside the low-level routine. The disadvantage is that you then
have to pass the bitboard by reference, which typically leaves fewer optimization
opportunities for the compiler. Sometimes compilers are also not clever enough to
optimize the extraction of the two 32-bit words from a 64-bit word. Using a union
for BITBOARD will help them (and make the code even uglier, with further
disadvantages). A sketch of both ideas follows below.
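For illustration only - a rough sketch of both ideas, not taken from your code
(the names BITBOARD_U and pop_FirstOne are made up), assuming little-endian x86
and the asmdirty_FirstOne() above:
typedef union {
    BITBOARD v;        /* the full 64-bit board */
    unsigned int w[2]; /* on little-endian x86: w[0] = low word, w[1] = high word */
} BITBOARD_U;
BITSCAN_INLINE unsigned int pop_FirstOne(BITBOARD *b)
{
    unsigned int sq = asmdirty_FirstOne(*b); /* any of the scan routines will do */
    *b &= *b - 1;  /* clear the scanned (lowest) bit in the caller's bitboard */
    return sq;
}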
I also give a small test program with many bitscanning routines below. With a
little setup of #defines it should work, but it is a hack, not too serious.
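For example, with gcc something like
  gcc -O2 -Dtestfunc=asmdirty_LastOne -o bitscan_test bitscan_test.c
should do (the file name is just whatever you save the program as); other
functions can be selected with -Dtestfunc= in the same way.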
Regards,
Dieter
/* Note: the code uses C++/C99 style comments in some parts */
/* Speed test of several bitscanning routines. Many
are taken from CCC. Therefore, they are also not coded
really consistently. They typically assume that
they will not be called with zero. Not all are
totally portable. They may for example assume that unsigned int
is a 32-bit type. To adapt them to another system should be
easy, however.
Various people seem to use FirstOne and LastOne just the other
way around. */
/* Define one of the functions here, or preferably
through the compiler option -Dtestfunc=... */
#ifndef testfunc
#define testfunc FindFirst_GCP_manual_mult
#endif
#ifndef NBB
#define NBB (100) /* Number of different bitboards for the test loop */
#endif
#define NLOOPS (0xffffffffUL/64/NBB) /* To never let check result overflow */
#ifdef __GNUC__ /* Probably works also for Intel CC under Linux */
#define MYINLINE static __inline__
typedef unsigned long long BITBOARD;
#define HAVE_GCC_X86_INLINE_ASSEMBLY 1 /* redefine to zero on other platforms */
#else /* assume MSVC like compiler, works perhaps also with ICC */
#define MYINLINE static __forceinline
typedef unsigned __int64 BITBOARD;
#define HAVE_MSVC_X86_INLINE_ASSEMBLY 1
#endif
#define BITSCAN_INLINE MYINLINE
typedef unsigned char LTBL_T; /* Used for lookup tables */
typedef unsigned int UINT32;
#define str(x) #x
#define xstr(x) str(x)
#define tftext xstr(testfunc) /* Make testfunc a string */
static unsigned int (*checkfunc)(BITBOARD);
/* Speed does not matter, just for correctness test */
unsigned simple_last_one(BITBOARD b)
{
unsigned r=63;
if (b == 0)
return 0;
while ((b & ((BITBOARD)1 << r)) == 0)
r--;
return r;
}
unsigned simple_first_one(BITBOARD b)
{
unsigned r=0;
if (b == 0)
return 0;
while ((b & 1) == 0)
{
b >>= 1;
r++;
}
return r;
}
const LTBL_T LSB_64_table[154] =
{
#define __ 0
23,__,__,__,31,__,__,39,19,__, 17,16,18,__,47,10,20, 9, 8,11,
1, 0, 2,57,56,58, 3,12,__,59, __,__,21,__, 4,__,__,60,__,__,
__,__,__,13,__,__,__,__,__,__, 5,__,__,61,__,__,__,__,__,__,
__,__,__,__,22,__,__,__,30,__, __,38,__,__,__,14,__,__,46,__,
__,__, 6,__,__,62,__,__,__,54, __,__,__,__,__,__,__,__,__,__,
29,__,__,37,__,__,__,__,__,__, 45,__,__,__,__,__,28,__,__,36,
__,53,__,__,27,__,44,35,26,24, 25,34,32,33,43,40,41,52,42,15,
__,50,48,49,__,51, 7,__,__,63, __,__,__,55
#undef __
};
#define LSB_64_adj -51 // offset to table base
#define LSB_64_magic ( (UINT32)0x01C5FC81 ) // magic constant
BITSCAN_INLINE unsigned int WalterFaxonsMagicBitscan(BITBOARD bb)
{
BITBOARD t64;
UINT32 t32;
t64 = bb - 1;
bb &= t64; /* omit this line to retain current LSB */
t64 ^= bb;
t32 = (UINT32)t64 ^ (UINT32)(t64 >> 32);
t32 ^= LSB_64_magic;
t32 += t32 >> 16;
t32 -= t32 >> 8;
return LSB_64_table [LSB_64_adj + (unsigned char)t32];
}
const BITBOARD magic = 0x03f08c5392f756cdULL;
const unsigned int magicl = 0x92f756cd, magich=0x03f08c53;
const unsigned char magictable[64] = {
0, 1, 12, 2, 13, 22, 17, 3,
14, 33, 23, 36, 18, 58, 28, 4,
62, 15, 34, 26, 24, 48, 50, 37,
19, 55, 59, 52, 29, 44, 39, 5,
63, 11, 21, 16, 32, 35, 57, 27,
61, 25, 47, 49, 54, 51, 43, 38,
10, 20, 31, 56, 60, 46, 53, 42,
9, 30, 45, 41, 8, 40, 7, 6,
};
BITSCAN_INLINE unsigned int FindFirst_GCP(const BITBOARD b) {
const BITBOARD lsb = b & -b;
return magictable[(lsb * magic) >> 58];
}
BITSCAN_INLINE unsigned int FindFirst_GCP_manual_mult(BITBOARD b)
{
unsigned int bl, bh, r;
b &= -b;
bh = b >> 32;
bl = b;
r = ((bl*(BITBOARD)magicl)>>32) + bl*magich + bh*magicl;
return magictable[r>>26];
}
const LTBL_T lsz64_tbl[64] =
{
63,30, 3,32,59,14,11,33,
60,24,50, 9,55,19,21,34,
61,29, 2,53,51,23,41,18,
56,28, 1,43,46,27, 0,35,
62,31,58, 4, 5,49,54, 6,
15,52,12,40, 7,42,45,16,
25,57,48,13,10,39, 8,44,
20,47,38,22,17,37,36,26,
};
BITSCAN_INLINE unsigned int MattTaylorsLastOne(BITBOARD bb)
{
BITBOARD lsb = bb ^ (bb - 1);
#if 0
bb &= ~lsb; /* omit this line to retain current LSB */
#endif
unsigned int foldedLSB = ((int) lsb) ^ ((int)(lsb>>32));
return lsz64_tbl[foldedLSB * 0x78291ACF >> 26];
}
#if HAVE_MSVC_X86_INLINE_ASSEMBLY
/* From Gerd */
BITSCAN_INLINE unsigned int asmdirty_FirstOne(BITBOARD bb)
{
__asm
{
bsf eax, [bb+4]
xor eax, 32
bsf eax, [bb]
}
}
BITSCAN_INLINE unsigned int asmdirty_LastOne(BITBOARD bb)
{
__asm
{
xor edx, edx
bsr eax, [bb]
bsr eax, [bb+4]
setnz dl
shl edx, 5
add eax, edx
}
}
#endif
#if HAVE_GCC_X86_INLINE_ASSEMBLY
BITSCAN_INLINE unsigned int asmdirty_FirstOne(BITBOARD a)
{
unsigned int res;
__asm__ __volatile__
("bsfl %2, %0 \n"
"addl $32, %0 \n"
"bsfl %1, %0 \n"
#if __ICC
: "=r" (res)
#else
: "=r&" (res)
#endif
: "rm"((unsigned int)a), "rm" ((unsigned int)(a>>32))
: "cc");
return res;
}
#if 1
BITSCAN_INLINE unsigned int asmdirty_LastOne(BITBOARD a)
{
unsigned int res, tmp;
__asm__ __volatile__
("xorl %1, %1 \n"
"bsrl %2, %0 \n"
"bsrl %3, %0 \n"
"setnz %b1 \n"
"shll $5, %1 \n"
"addl %1, %0\n"
#if __ICC
: "=r" (res), "=q" (tmp)
#else
: "=r&" (res), "=q&" (tmp)
#endif
: "rm"((unsigned int)a), "rm" ((unsigned int)(a>>32))
: "cc");
return res;
}
#else
/* Shown by Bob Hyatt in CCC, attributed to Eugene Nalimov
cmpl $1, 8(%esp)
sbbl %eax, %eax
movl 8(%esp,%eax,4), %edx
bsr %edx, %ecx
jz l4
andl $32, %eax
subl $31, %ecx
subl %ecx, %eax
ret
l4: movl $64, %eax
ret
*/
/* Gcc seems to misoptimize this, depending on specific conditions
when called. Works fine with Icc, but slower than above */
BITSCAN_INLINE unsigned int asmdirty_LastOne(BITBOARD a)
{
unsigned int t, r;
__asm__ __volatile__(
"cmpl $1, 4(%2) \n"
"sbbl %0, %0 \n"
"movl 4(%2,%0,4), %1 \n"
"bsrl %1, %1 \n"
"notl %0 \n"
"andl $32, %0 \n"
"addl %1, %0 \n"
#if __ICC
: "=r" (r), "=r" (t)
#else
: "=r&" (r), "=r" (t)
#endif
: "r" (&a)
: "cc"
);
return r;
}
#endif
#endif
LTBL_T lo_ltbl[0x10000];
BITSCAN_INLINE unsigned int ltbl8_LastOne(BITBOARD arg)
{
unsigned int result = 0, t;
if (arg > 0xFFFFFFFF)
{
arg >>= 32;
result = 32;
}
t = arg;
if (t > 0xFFFF)
{
t >>= 16;
result += 16;
}
if (t > 0xFF)
{
t >>= 8;
result += 8;
}
return result + lo_ltbl[t];
}
BITSCAN_INLINE unsigned int ltbl16_LastOne(BITBOARD arg)
{
unsigned int result = 0, t;
if (arg > 0xFFFFFFFF)
{
arg >>= 32;
result = 32;
}
t = arg;
if (t > 0xFFFF)
{
t >>= 16;
result += 16;
}
return result + lo_ltbl[t];
}
BITSCAN_INLINE unsigned int log2_LastOne(BITBOARD arg)
{
unsigned int r = 0, t;
if (arg > 0xFFFFFFFF)
{
arg >>= 32;
r = 32;
}
t = arg;
if (t & 0xFFFF0000) { t &= 0xFFFF0000; r+=16; }
if (t & 0xFF00FF00) { t &= 0xFF00FF00; r+=8; }
if (t & 0xF0F0F0F0) { t &= 0xF0F0F0F0; r+=4; }
if (t & 0xCCCCCCCC) { t &= 0xCCCCCCCC; r+=2; }
if (t & 0xAAAAAAAA) r+=1;
return r;
}
/* Walter Faxon's implementation of Matt Taylor's routine */
/* ---------------------------------------------------------------------------
Should we use a union to force better alignment of this table? */
static const LTBL_T MT32table [32] = {
0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
#define MT32magic (130329821UL)
/* Return the index of the least-significant bit of a 64-bit word; the
argument must be non-zero.
"Magic" indexing algorithm by Matt Taylor.
Version for 32-bit architecture by WF.
No warranty.
*/
BITSCAN_INLINE unsigned int leastSigBit64( BITBOARD bb )
{
unsigned long bb0, bb1;
unsigned int bbHalf;
bb1 = bb>>32;
bb0 = bb;
bbHalf = (bb0 == 0);
if (bbHalf) bb0 = bb1; /* will code as cmov (ideally) */
/* buers: icc and gcc can indeed do it, but it does not seem to
help much */
bb0 ^= bb0 - 1;
bb0 *= MT32magic;
return (bbHalf << 5) | MT32table[bb0 >> 27];
/* if bbHalf in byte-addressable register, bitwise-or
preferred to avoid int+char type/sizing conflict */
}
#define COMBINE_COUNTERS(x,m,s) x = (x&m) + ((x>>s)&m)
MYINLINE unsigned int bitfiddling(BITBOARD b)
{
unsigned long u, v;
v = b>>32;
u = b;
COMBINE_COUNTERS(u, 0x55555555, 1);
COMBINE_COUNTERS(v, 0x55555555, 1);
/* Depending on compiler, optimization options, free registers and the
specific inlining situation, one or the other might be faster. */
#if 0
COMBINE_COUNTERS(u, 0x33333333, 2);
COMBINE_COUNTERS(v, 0x33333333, 2);
/* Now the counters have space to add the 2 parts
without overflow */
u += v;
#else
u = (u & 0x33333333) + ((u >> 2) & 0x33333333)
+ (v & 0x33333333) + ((v >> 2) & 0x33333333);
#endif
COMBINE_COUNTERS(u, 0x0f0f0f0f, 4);
/* return u%255; */
/* return (u * 0x01010101) >> 24; */
/* COMBINE_COUNTERS(u, 0x00ff00ff, 8); */
u += u >> 8;
/* COMBINE_COUNTERS(u, 0x0000ffff, 16); */
u += u >> 16;
return u & 0xff; /* 0x7f would be enough as mask. */
}
static LTBL_T btbl16[0x10000];
MYINLINE int lookup16(BITBOARD b)
{
unsigned long v, w;
LTBL_T c;
w = b>>32;
v = b;
/* Depending on compiler, optimization options, free registers and the
specific inlining situation, one or the other might be faster. */
#if 1
c = btbl16[w & 0xffff];
c += btbl16[v & 0xffff];
w >>= 16;
v >>= 16;
c += btbl16[w /* & 0xffff*/];
c += btbl16[v /* & 0xffff*/];
#endif
#if 0
c = btbl16[w & 0xffff] + btbl16[v & 0xffff] + btbl16[w >> 16] + btbl16[v >> 16];
#endif
return c;
}
/* static LTBL_T btbl11[0x800]; */
#define btbl11 btbl16
MYINLINE int lookup11(BITBOARD b)
{
unsigned long v, w;
LTBL_T c;
w = b>>32;
v = b;
c = btbl11[w & 0x7ff] + btbl11[v & 0x7ff]
+ btbl11[(w>>11)&0x7ff] + btbl11[(v>>11)&0x7ff]
+ btbl11[w>>22] + btbl11[v>>22];
return c;
}
/* static LTBL_T btbl8[0x100]; */
#define btbl8 btbl16
MYINLINE int lookup8(BITBOARD b)
{
unsigned long v,w;
LTBL_T c;
w = b>>32;
v = b;
c = btbl8[w & 0xff];
w >>= 8;
c += btbl8[w & 0xff];
w >>= 8;
c += btbl8[w & 0xff];
w >>= 8;
c += btbl8[w /* & 0xff*/];
w = v;
c += btbl8[w & 0xff];
w >>= 8;
c += btbl8[w & 0xff];
w >>= 8;
c += btbl8[w & 0xff];
w >>= 8;
c += btbl8[w /* & 0xff*/];
#if 0
w = bb.w[0];
v = bb.w[1];
c = btbl8[w & 0xff] + btbl8[v & 0xff]
+ btbl8[(w>>8)&0xff] + btbl8[(v>>8)&0xff]
+ btbl8[(w>>16)&0xff] + btbl8[(v>>16)&0xff]
+ btbl8[w>>24] + btbl8[v>>24];
#endif
#if 0
unsigned char *pc = bb.bytes;
c = btbl8[pc[0]] + btbl8[pc[1]] + btbl8[pc[2]] + btbl8[pc[3]]
+ btbl8[pc[4]] + btbl8[pc[5]] + btbl8[pc[6]] + btbl8[pc[7]];
#endif
return c;
}
BITSCAN_INLINE unsigned int popcount_bitscan(BITBOARD b)
{
b = (b & -b)-1;
// return bitfiddling(b);
return lookup16(b);
}
/* Test the different functions */
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#define PRNG rand
#define MY_RAND_MAX RAND_MAX
unsigned long rand_range(unsigned long range)
{
unsigned long rmax, r, d;
/* find the largest number rmax <= MY_RAND_MAX, for which
(rmax+1) % range == 0.
All returns from rand() > rmax will be skipped, to guarantee
equal probability for all return values. */
d = (MY_RAND_MAX+1U-range) / range + 1;
rmax = d * range - 1; /* -1 to avoid "overflow to zero" */
do
r = PRNG();
while (r > rmax);
return r/d;
}
BITBOARD bbs[NBB];
/* Define the functions in the "wrong" order, so the compiler
does not do stupid inlining that can make the code slower */
unsigned long foo(void);
unsigned long bar(void);
static void show_errors(void)
{
unsigned r1, r2;
int i, j;
for (i=0; i<NBB; i++)
{
r1 = checkfunc(bbs[i]);
r2 = testfunc(bbs[i]);
if (r1 != r2)
{
for (j=63; j>=0; j--)
printf("%d", (int)((bbs[i]>>j)&1));
printf(":r1=%2u,r2=%2u\n", r1, r2);
}
}
}
int main(void)
{
int i, j, nbits,nl=0,k;
unsigned long expect, result, b;
clock_t c, sum=0;
double t;
printf("Testing %s %s\n", xstr(BITSCAN_INLINE), tftext);
/* Initialize the lookup table, use one table for the different functions */
for (b=0; b < sizeof lo_ltbl/sizeof lo_ltbl[0]; b++)
lo_ltbl[b] = simple_last_one(b);
for (b=0; b < sizeof btbl16/sizeof btbl16[0]; b++)
btbl16[b] = bitfiddling(b);
if (testfunc(3)==1)
checkfunc = simple_last_one;
else
checkfunc = simple_first_one;
for (nbits = 1; nbits <= 128; nbits = nbits < 8 ? nbits+1 : nbits*2)
{
expect = 0;
/* Set some random bitboards */
for (i=0; i<NBB; i++)
{
bbs[i] = 0;
for (j=0; j<nbits; j++)
bbs[i] |= (BITBOARD)1 << rand_range(64);
expect += checkfunc(bbs[i]);
}
expect *= NLOOPS;
c = clock();
for (k=0;k<10;k++)
result = bar();
c = clock()-c;
sum += c;
printf("%3d: %7.3f s", nbits, (double)c/CLOCKS_PER_SEC);
if (expect != result)
{
printf(" ERROR: expected %lu, result %lu", expect, result);
show_errors();
}
printf("\n");
nl++;
}
t = (double)sum / CLOCKS_PER_SEC;
printf("sum: %7.3f s for %.0f calls, %.3f ns/call\n",
t, (double)nl*NLOOPS*NBB,
t*1e9/((double)nl*NLOOPS*NBB));
return 0;
}
unsigned long bar(void)
{
long i;
unsigned long n=0;
for (i=0; i<NLOOPS; i++)
n += foo();
return n;
}
/* May inline testfunc here */
unsigned long foo(void)
{
int i;
unsigned long n=0;
for (i=0; i<NBB; i++)
n += testfunc(bbs[i]);
return n;
}