Author: Dieter Buerssner
Date: 13:15:17 06/14/04
I collected some suggested routines; see the source code below. My results
on a P4 at 2.53 GHz:
With Gcc 3.1
Testing static __inline__ ltbl8_LastOne
sum: 46.506 s for 805305600 calls, 57.750 ns/call
Testing static __inline__ ltbl16_LastOne
sum: 36.462 s for 805305600 calls, 45.277 ns/call
Testing static __inline__ log2_LastOne
sum: 112.842 s for 805305600 calls, 140.123 ns/call
Testing static __inline__ FindFirst_GCP
sum: 89.909 s for 805305600 calls, 111.646 ns/call
Testing static __inline__ FindFirst_GCP_manual_mult
sum: 70.942 s for 805305600 calls, 88.093 ns/call
Testing static __inline__ MattTaylorsLastOne
sum: 33.237 s for 805305600 calls, 41.273 ns/call
Testing static __inline__ leastSigBit64
sum: 40.678 s for 805305600 calls, 50.513 ns/call
Testing static __inline__ WalterFaxonsMagicBitscan
sum: 42.120 s for 805305600 calls, 52.303 ns/call
Testing static __inline__ asmdirty_FirstOne
sum: 32.596 s for 805305600 calls, 40.477 ns/call
Testing static __inline__ asmdirty_LastOne
sum: 33.608 s for 805305600 calls, 41.733 ns/call
With MSVC 7.0 free command line tools
Testing static __forceinline ltbl8_LastOne
sum: 34.850 s for 805305600 calls, 43.275 ns/call
Testing static __forceinline ltbl16_LastOne
sum: 27.109 s for 805305600 calls, 33.663 ns/call
Testing static __forceinline log2_LastOne
sum: 75.308 s for 805305600 calls, 93.515 ns/call
Testing static __forceinline FindFirst_GCP
sum: 159.459 s for 805305600 calls, 198.011 ns/call
Testing static __forceinline FindFirst_GCP_manual_mult
sum: 69.469 s for 805305600 calls, 86.264 ns/call
Testing static __forceinline MattTaylorsLastOne
sum: 29.602 s for 805305600 calls, 36.759 ns/call
Testing static __forceinline leastSigBit64
sum: 32.757 s for 805305600 calls, 40.676 ns/call
Testing static __forceinline WalterFaxonsMagicBitscan
sum: 36.172 s for 805305600 calls, 44.917 ns/call
Testing static __forceinline asmdirty_FirstOne
sum: 27.349 s for 805305600 calls, 33.961 ns/call
Testing static __forceinline asmdirty_LastOne
sum: 30.413 s for 805305600 calls, 37.766 ns/call
There are significant differences between those two compilers, especially for the
log2 method and for FindFirst_GCP[_manual_mult]. With MSVC, the manual_mult variant
is more than twice as fast as the plain one; with Gcc it is only a bit faster, and
on Linux with ICC both ran at the same speed. Perhaps I did the calculation wrong.
Is it really possible that even the fastest bitscan in my test takes about 80
cycles (roughly 34 ns/call at 2.53 GHz is about 85 cycles)?
I only have Icc under Linux, and cannot access the results at the moment, but
they were similar. The attached source code does some loop testing; it is not
sophisticated, but also not totally primitive. With the default setup, it times
the selected function several times on 100 randomly chosen bitboards, and this
is repeated with an increasing (average) number of set bits. I also attach a
simple shell script that tests all functions automatically with the compiler of
your choice and your favorite optimization options. Under Windows it works when
you have Cygwin or some other /bin/sh port installed.
Of course, one should be aware of the limitations of such simple testing.
In particular, the lookup-table methods (with a large table) may look much more
attractive here than in a real program, where the table has to compete with
other data for the cache.
Things you might want to try: change LTBL_T in the source (to unsigned int, for
example), try with and without inlining, ...
Regards,
Dieter
tbitscan.c:
/* Speed test for several bitscanning routines. Many
are taken from CCC; therefore they are also not coded
really consistently. They typically assume that
they will not be called with zero. Not all are
totally portable. They may for example assume that unsigned int
is a 32-bit type. Adapting them to another system should be
easy, however. Typically the functions are somewhat micro-optimized
for a 32-bit environment.
Some routines need to access the BITBOARD as two 32-bit words. I used
w32h = bb>>32 and w32l = bb. This can be micro-optimized again. Gcc has problems
with this. Using a union would solve it (but is of course not strictly
portable; a sketch follows the typedefs below).
Various people seem to use FirstOne and LastOne just the other
way around. */
/* Define one of the functions here, or preferably
through the compiler option -Dtestfunc=... */
#ifndef testfunc
#define testfunc FindFirst_GCP_manual_mult
#endif
#ifndef NBB
#define NBB (100) /* Number of different bitboards */
#endif
#define NLOOPS (0xffffffffUL/64/NBB) /* To never let check result overflow */
#ifdef __GNUC__ /* Probably works also for Intel CC under Linux */
#define MYINLINE static __inline__
typedef unsigned long long BITBOARD;
#define HAVE_GCC_X86_INLINE_ASSEMBLY 1 /* redefine to zero on other platforms */
#else /* assume MSVC like compiler, works perhaps also with ICC */
#define MYINLINE static __forceinline
typedef unsigned __int64 BITBOARD;
#define HAVE_MSVC_X86_INLINE_ASSEMBLY 1
#endif
#define BITSCAN_INLINE MYINLINE
typedef unsigned char LTBL_T; /* Used for lookup tables */
typedef unsigned int UINT32;
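/* buers: a sketch of the union mentioned in the header comment, as one way
to get at the two 32-bit halves without shifts. It assumes a little-endian
target (x86), the name BB_HALVES is just an example, and it is not used by
the routines below. */
typedef union
{
BITBOARD bb;
struct { UINT32 lo, hi; } w; /* on little-endian: w.lo = low, w.hi = high 32 bits */
} BB_HALVES;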
#define str(x) #x
#define xstr(x) str(x)
#define tftext xstr(testfunc) /* Make testfunc a string */
static unsigned int (*checkfunc)(BITBOARD);
/* Speed does not matter, just for correctness test */
unsigned simple_last_one(BITBOARD b)
{
unsigned r=63;
if (b == 0)
return 0;
while ((b & ((BITBOARD)1 << r)) == 0)
r--;
return r;
}
unsigned simple_first_one(BITBOARD b)
{
unsigned r=0;
if (b == 0)
return 0;
while ((b & 1) == 0)
{
b >>= 1;
r++;
}
return r;
}
const LTBL_T LSB_64_table[154] =
{
#define __ 0
23,__,__,__,31,__,__,39,19,__, 17,16,18,__,47,10,20, 9, 8,11,
1, 0, 2,57,56,58, 3,12,__,59, __,__,21,__, 4,__,__,60,__,__,
__,__,__,13,__,__,__,__,__,__, 5,__,__,61,__,__,__,__,__,__,
__,__,__,__,22,__,__,__,30,__, __,38,__,__,__,14,__,__,46,__,
__,__, 6,__,__,62,__,__,__,54, __,__,__,__,__,__,__,__,__,__,
29,__,__,37,__,__,__,__,__,__, 45,__,__,__,__,__,28,__,__,36,
__,53,__,__,27,__,44,35,26,24, 25,34,32,33,43,40,41,52,42,15,
__,50,48,49,__,51, 7,__,__,63, __,__,__,55
#undef __
};
#define LSB_64_adj -51 // offset to table base
#define LSB_64_magic ( (UINT32)0x01C5FC81 ) // magic constant
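/* buers, as I understand the routine: it computes the mask of all bits below
the least significant set bit, folds the two 32-bit halves of that mask
together, and the magic constant plus the add/sub shuffle turn the folded
value into a byte index for the sparse 154-entry table. */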
BITSCAN_INLINE unsigned int WalterFaxonsMagicBitscan(BITBOARD bb)
{
BITBOARD t64;
UINT32 t32;
t64 = bb - 1;
bb &= t64; /* omit this line to retain current LSB */
t64 ^= bb;
t32 = (UINT32)t64 ^ (UINT32)(t64 >> 32);
t32 ^= LSB_64_magic;
t32 += t32 >> 16;
t32 -= t32 >> 8;
return LSB_64_table [LSB_64_adj + (unsigned char)t32];
}
const BITBOARD magic = 0x03f08c5392f756cdULL;
const unsigned int magicl = 0x92f756cd, magich=0x03f08c53;
const unsigned char magictable[64] = {
0, 1, 12, 2, 13, 22, 17, 3,
14, 33, 23, 36, 18, 58, 28, 4,
62, 15, 34, 26, 24, 48, 50, 37,
19, 55, 59, 52, 29, 44, 39, 5,
63, 11, 21, 16, 32, 35, 57, 27,
61, 25, 47, 49, 54, 51, 43, 38,
10, 20, 31, 56, 60, 46, 53, 42,
9, 30, 45, 41, 8, 40, 7, 6,
};
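/* buers: b & -b isolates the least significant set bit; the multiplication
by the magic constant (a De Bruijn-style multiplier) moves a pattern that is
unique for each bit position into the top six bits, which index the table. */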
BITSCAN_INLINE unsigned int FindFirst_GCP(const BITBOARD b) {
const BITBOARD lsb = b & -b;
return magictable[(lsb * magic) >> 58];
}
BITSCAN_INLINE unsigned int FindFirst_GCP_manual_mult(BITBOARD b)
{
unsigned int bl, bh, r;
b &= -b;
bh = b >> 32;
bl = b;
r = ((bl*(BITBOARD)magicl)>>32) + bl*magich + bh*magicl;
return magictable[r>>26];
}
const LTBL_T lsz64_tbl[64] =
{
63,30, 3,32,59,14,11,33,
60,24,50, 9,55,19,21,34,
61,29, 2,53,51,23,41,18,
56,28, 1,43,46,27, 0,35,
62,31,58, 4, 5,49,54, 6,
15,52,12,40, 7,42,45,16,
25,57,48,13,10,39, 8,44,
20,47,38,22,17,37,36,26,
};
BITSCAN_INLINE unsigned int MattTaylorsLastOne(BITBOARD bb)
{
BITBOARD lsb = bb ^ (bb - 1);
#if 0
bb &= ~lsb; /* omit this line to retain current LSB */
#endif
unsigned int foldedLSB = ((int) lsb) ^ ((int)(lsb>>32));
return lsz64_tbl[foldedLSB * 0x78291ACF >> 26];
}
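/* buers: the bsf/bsr-only scans below rely on bsf/bsr leaving the
destination register unchanged when the source operand is zero. This is
officially undefined, but holds on the x86 CPUs of interest here. */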
#if HAVE_MSVC_X86_INLINE_ASSEMBLY
/* From Gerd */
BITSCAN_INLINE unsigned int asmdirty_FirstOne(BITBOARD bb)
{
__asm
{
bsf eax, [bb+4]
xor eax, 32
bsf eax, [bb]
}
}
BITSCAN_INLINE unsigned int asmdirty_LastOne(BITBOARD bb)
{
__asm
{
xor edx, edx
bsr eax, [bb]
bsr eax, [bb+4]
setnz dl
shl edx, 5
add eax, edx
}
}
#endif
#if HAVE_GCC_X86_INLINE_ASSEMBLY
BITSCAN_INLINE unsigned int asmdirty_FirstOne(BITBOARD a)
{
unsigned int res;
__asm__ __volatile__
("bsfl %2, %0 \n"
"addl $32, %0 \n"
"bsfl %1, %0 \n"
#if __ICC
: "=r" (res)
#else
: "=r&" (res)
#endif
: "rm"((unsigned int)a), "rm" ((unsigned int)(a>>32))
: "cc");
return res;
}
#if 1
BITSCAN_INLINE unsigned int asmdirty_LastOne(BITBOARD a)
{
unsigned int res, tmp;
__asm__ __volatile__
("xorl %1, %1 \n"
"bsrl %2, %0 \n"
"bsrl %3, %0 \n"
"setnz %b1 \n"
"shll $5, %1 \n"
"addl %1, %0\n"
#if __ICC
: "=r" (res), "=q" (tmp)
#else
: "=r&" (res), "=q&" (tmp)
#endif
: "rm"((unsigned int)a), "rm" ((unsigned int)(a>>32))
: "cc");
return res;
}
#else
/* Shown by Bob Hyatt in CCC, attributed to Eugene Nalimov
cmpl $1, 8(%esp)
sbbl %eax, %eax
movl 8(%esp,%eax,4), %edx
bsr %edx, %ecx
jz l4
andl $32, %eax
subl $31, %ecx
subl %ecx, %eax
ret
l4: movl $64, %eax
ret
*/
/* Gcc seems to misoptimize this, depending on the specific conditions
at the call site. Works fine with Icc, but is slower than the version above */
BITSCAN_INLINE unsigned int asmdirty_LastOne(BITBOARD a)
{
unsigned int t, r;
__asm__ __volatile__(
"cmpl $1, 4(%2) \n"
"sbbl %0, %0 \n"
"movl 4(%2,%0,4), %1 \n"
"bsrl %1, %1 \n"
"notl %0 \n"
"andl $32, %0 \n"
"addl %1, %0 \n"
#if __ICC
: "=r" (r), "=r" (t)
#else
: "=r&" (r), "=r" (t)
#endif
: "r" (&a)
: "cc"
);
return r;
}
#endif
#endif
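/* Filled in main() with the index of the highest set bit for every 16-bit
value; ltbl16_LastOne indexes it with 16-bit values, ltbl8_LastOne only
with 8-bit values. */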
LTBL_T lo_ltbl[0x10000];
BITSCAN_INLINE unsigned int ltbl8_LastOne(BITBOARD arg)
{
unsigned int result = 0, t;
if (arg > 0xFFFFFFFF)
{
arg >>= 32;
result = 32;
}
t = arg;
if (t > 0xFFFF)
{
t >>= 16;
result += 16;
}
if (t > 0xFF)
{
t >>= 8;
result += 8;
}
return result + lo_ltbl[t];
}
BITSCAN_INLINE unsigned int ltbl16_LastOne(BITBOARD arg)
{
unsigned int result = 0, t;
if (arg > 0xFFFFFFFF)
{
arg >>= 32;
result = 32;
}
t = arg;
if (t > 0xFFFF)
{
t >>= 16;
result += 16;
}
return result + lo_ltbl[t];
}
BITSCAN_INLINE unsigned int log2_LastOne(BITBOARD arg)
{
unsigned int r = 0, t;
if (arg > 0xFFFFFFFF)
{
arg >>= 32;
r = 32;
}
t = arg;
if (t & 0xFFFF0000) { t &= 0xFFFF0000; r+=16; }
if (t & 0xFF00FF00) { t &= 0xFF00FF00; r+=8; }
if (t & 0xF0F0F0F0) { t &= 0xF0F0F0F0; r+=4; }
if (t & 0xCCCCCCCC) { t &= 0xCCCCCCCC; r+=2; }
if (t & 0xAAAAAAAA) r+=1;
return r;
}
/* Walter Faxon's implementation of Matt Taylor's routine */
/* ---------------------------------------------------------------------------
Should we use a union to force better alignment of this table? */
static const LTBL_T MT32table [32] = {
0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
#define MT32magic (130329821UL)
/* Return the index of the least-significant bit of a 64-bit argument;
the argument must be non-zero.
"Magic" indexing algorithm by Matt Taylor.
Version for 32-bit architecture by WF.
No warranty.
*/
BITSCAN_INLINE unsigned int leastSigBit64( BITBOARD bb )
{
unsigned long bb0, bb1;
unsigned int bbHalf;
bb1 = bb>>32;
bb0 = bb;
bbHalf = (bb0 == 0);
if (bbHalf) bb0 = bb1; /* will code as cmov (ideally) */
/* buers: icc and gcc can indeed do it, but does not seem to
help much */
bb0 ^= bb0 - 1;
bb0 *= MT32magic;
return (bbHalf << 5) | MT32table[bb0 >> 27];
/* if bbHalf in byte-addressable register, bitwise-or
preferred to avoid int+char type/sizing conflict */
}
/* Test the different functions */
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#define PRNG rand
#define MY_RAND_MAX RAND_MAX
unsigned long rand_range(unsigned long range)
{
unsigned long rmax, r, d;
/* find the largest number rmax <= MY_RAND_MAX, for which
(rmax+1) % range == 0.
All returns from rand() > rmax will be skipped, to guarantee
equal probability for all return values. */
d = (MY_RAND_MAX+1U-range) / range + 1;
rmax = d * range - 1; /* -1 to avoid "overflow to zero" */
do
r = PRNG();
while (r > rmax);
return r/d;
}
BITBOARD bbs[NBB];
/* Define the functions in the "wrong" order, so the compiler does not
do stupid inlining that can make the code slower */
unsigned long foo(void);
unsigned long bar(void);
static void show_errors(void)
{
unsigned r1, r2;
int i, j;
for (i=0; i<NBB; i++)
{
r1 = checkfunc(bbs[i]);
r2 = testfunc(bbs[i]);
if (r1 != r2)
{
for (j=63; j>=0; j--)
printf("%d", (int)((bbs[i]>>j)&1));
printf(":r1=%2u,r2=%2u\n", r1, r2);
}
}
}
int main(void)
{
int i, j, nbits,nl=0,k;
unsigned long expect, result, b;
clock_t c, sum=0;
double t;
printf("Testing %s %s\n", xstr(BITSCAN_INLINE), tftext);
/* Initialize the lookup table, use one table for the different functions */
for (b=0; b < sizeof lo_ltbl/sizeof lo_ltbl[0]; b++)
lo_ltbl[b] = simple_last_one(b);
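/* Decide whether testfunc is LastOne- or FirstOne-style:
for the argument 3, a LastOne scan returns 1, a FirstOne scan returns 0 */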
if (testfunc(3)==1)
checkfunc = simple_last_one;
else
checkfunc = simple_first_one;
for (nbits = 1; nbits <= 128; nbits = nbits < 8 ? nbits+1 : nbits*2)
{
expect = 0;
/* Set some random bitboards */
for (i=0; i<NBB; i++)
{
bbs[i] = 0;
for (j=0; j<nbits; j++)
bbs[i] |= (BITBOARD)1 << rand_range(64);
expect += checkfunc(bbs[i]);
}
expect *= NLOOPS;
c = clock();
for (k=0;k<10;k++)
result = bar();
c = clock()-c;
sum += c;
printf("%3d: %7.3f s", nbits, (double)c/CLOCKS_PER_SEC);
if (expect != result)
{
printf(" ERROR: expected %lu, result %lu", expect, result);
show_errors();
}
printf("\n");
nl++;
}
t = (double)sum / CLOCKS_PER_SEC;
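/* Note: nl*NLOOPS*NBB counts one bar() pass per bit count; the timed loop
above calls bar() 10 times per bit count, and that factor is not included
in the call count (nor in the ns/call figure). */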
printf("sum: %7.3f s for %.0f calls, %.3f ns/call\n",
t, (double)nl*NLOOPS*NBB,
t*1e9/((double)nl*NLOOPS*NBB));
return 0;
}
unsigned long bar(void)
{
long i;
unsigned long n=0;
for (i=0; i<NLOOPS; i++)
n += foo();
return n;
}
/* May inline testfunc here */
unsigned long foo(void)
{
int i;
unsigned long n=0;
for (i=0; i<NBB; i++)
n += testfunc(bbs[i]);
return n;
}
And the shell script:
#!/bin/sh
# Dirty hack. Does not check for errors (for example
# when the program does not compile)
#
# define one CC line with your favorite compiler and options
# and one summary line.
#CC="icc -o tbitscan"
#summary=icc.sum
#CC="gcc -O2 -fomit-frame-pointer -o tbitscan"
#summary=gcc.sum
CC="cl -Ox2 -Ob1 -G7"
summary=msvc.sum
for func in \
ltbl8_LastOne \
ltbl16_LastOne \
log2_LastOne \
FindFirst_GCP \
FindFirst_GCP_manual_mult \
MattTaylorsLastOne \
leastSigBit64 \
WalterFaxonsMagicBitscan \
asmdirty_FirstOne \
asmdirty_LastOne
do
$CC -Dtestfunc=$func tbitscan.c
./tbitscan | tee tmp.out
grep Testing tmp.out >> $summary
grep sum tmp.out >> $summary
done