Computer Chess Club Archives


Search

Terms

Messages

Subject: Re: BitScan with reset - not so impressive with 3DNow!

Author: Walter Faxon

Date: 15:49:10 12/02/02

Go up one level in this thread


On December 02, 2002 at 16:48:52, Gerd Isenberg wrote:

>May be i found not the best way to to b&(-b) with mmx, but building the 64-bit
>two's complement with mmx-dword is not so nice. First you have to do the one's
>complement by pxor -1, then comparing low dword with -1 and building an
>conditional overflow, adding 00:01 or 01:01...
>So using 32-bit registers was the fastest so far, but that may require some
>additional push/pop.
>
>Some times mesured in seconds with this dumb loop (nothing inlined):
>K7XP2.1+ ~1.8GHz
>
>10-bit pattern		bsf	PI2FD	btr	c
>0x0000000011111133	15.3	18.0	19.1	22.8
>0x1010111010101110	19.7	18.5	19.6	23.4
>0x1111113300000000	20.6	18.0	19.1	22.8
>
>inlined are ~5 seconds faster
>
>Cheers,
>Gerd
>
>-----------------------------------------------------------------------------
>int main()
>{
>	DWORD start = GetTickCount();
>	for (int i=0; i < 100000000; i++) // 10^8
>	{
>		BitBoard bb = 0x1010111010101110; // 10 bits set
>		while (bb)
>			bitSearchAndReset_PI2FD(bb); // 10^9 runs in total
>	}
>	DWORD stop = GetTickCount();
>	printf("Time in seconds: %d.%03d\n", (stop-start)/1000, (stop-start)%1000 );
>}
>
>-----------------------------------------------------------------------------
>
>int bitSearchAndReset_bsf(BitBoard &bb)
>{
>	__asm
>	{
>		xor		edx, edx
>		mov		esi, [bb]
>		xor		eax, eax
>		inc		edx
>		bsf		ecx, [esi]
>		jnz		found
>		bsf		ecx, [esi + 4]
>		lea		esi, [esi + 4]
>		xor		eax, 32
>	found:
>		shl		edx, cl
>		xor		eax, ecx
>		xor		[esi], edx
>	}
>}
>
>-----------------------------------------------------------------------------
>
>int bitSearchAndReset_PI2FD(BitBoard &bb)
>{
>	__asm
>	{
>		mov	ebx, [bb]	; get the reference (like a pointer)
>		pxor	mm1, mm1	; 0, to get the dword mask
>
>		mov	edx, [ebx]	; get bb
>		mov	esi, [ebx+4]; bb -> esi:edx
>
>		mov	ecx, edx
>		mov	eax, esi	; bb -> eax:esi
>
>		pcmpeqd	mm6, mm6	; -1 to complement the dword mask
>		pxor	mm7, mm7	; 0, to add both final dwords
>
>		neg	ecx		; low -bb
>		adc	eax, 0		; consider borrow
>		and	ecx, edx	; low (bb & -bb)
>		neg	eax		; high -bb
>		movd	mm0, ecx	; low (bb & -bb)
>		and	eax, esi	; high (bb & -bb)
>		xor	edx, ecx	; reset low
>		movd	mm2, eax	; high (bb & -bb)
>		xor	esi, eax	; reset high
>		punpckldq mm0, mm2	; bb & -bb -> single bit in mm0
>
>		mov	[ebx],   edx	; write modified bb back
>		mov	[ebx+4], esi
>
>		pcmpeqd	mm1, mm0	; mask of the zero dword
>		PI2FD	mm0, mm0	; 3f8..,400..
>		pxor	mm1, mm6	; mask of the none zero dword
>		psrlq	mm6, 63		; 00:01
>		psrld	mm0, 23		; 3f8 to 7f
>		psrld	mm1, 25		; 7f mask
>		psllq	mm6, 32+5	; 20:00
>		psubd	mm0, mm1	; - 7f mask
>		por	mm0, mm6	; + 32 in high dword
>		pand	mm0, mm1	; & 7f mask
>		psadbw	mm0, mm7	; add all bytes
>		movd	eax, mm0
>	}
>}
>
>-----------------------------------------------------------------------------
>
>int bitSearchAndReset_btr(BitBoard &bb)
>{
>	__asm
>	{
>		mov	edx, [bb]
>		bsf	eax, [edx+4]
>		xor	eax, 32
>		bsf	eax, [edx]
>		btr	[edx],eax
>	}
>}
>
>-----------------------------------------------------------------------------
>
>int bitSearchAndReset_C(BitBoard &bb)
>{
>	BitBoard lsbb = bb & (-(__int64)bb);
>	bb ^= lsbb;
>	unsigned int lsb = LOWBOARD(lsbb) | HIGHBOARD(lsbb);
>	return ((((((((((HIGHBOARD(lsbb)!=0) <<1)
>		    ^((lsb & 0xffff0000)!=0))<<1)
>		    ^((lsb & 0xff00ff00)!=0))<<1)
>		    ^((lsb & 0xf0f0f0f0)!=0))<<1)
>		    ^((lsb & 0xcccccccc)!=0))<<1)
>		    ^((lsb & 0xaaaaaaaa)!=0);
>}
==============================================================================

Hi, Gerd.

As long as you're test-comparing bit-search-and-reset codes, I wonder if you
could please consider also comparing my C code version, posted on CCC with the
subject "Another hacky method for bitboard bit extraction" on November 17.  I
repeat it below.  You can of course make those changes required for proper
compilation and comparison in your setup.  Thanks!

-- Walter

Code follows:
// ---------------------------------------------------------------------------

typedef unsigned long long  u64;    // nonstandard
typedef unsigned long       u32;
typedef unsigned char       u8;

extern const u8 LSB_64_table[154];              // bit number table
#define LSB_64_adj  -51                         // offset to table base
#define LSB_64_magic  ( (u32)0x01C5FC81 )       // magic constant

// ---------------------------------------------------------------------------
// LSB_64() -- find, remove, report least-significant bit of 64.
// Argument 'bb' must be non-null.  Method:  fold then table lookup.
// Written by Walter Faxon, June 2002.  No copyright.  No warranty.
//
inline                  // inline declaration may differ by compiler
u8 LSB_64( u64* bb )
    {
    u64 t64;
    u32 t32;
    t64 = *bb - 1;
    *bb &= t64;         // omit this line to retain current LSB
    t64 ^= *bb;
    t32 = (u32)t64 ^ (u32)(t64 >> 32);
    t32 ^= LSB_64_magic;
    t32 += t32 >> 16;
    t32 -= t32 >> 8;
    return LSB_64_table [LSB_64_adj + (u8)t32];
    }

// ---------------------------------------------------------------------------
// Table reports number of low-order bit as 0, high-order as 63.
// (Numbering can be reversed by changing this table.)
// Important:  arrange storage so that this table is kept in the cache.
const u8 LSB_64_table[154] =
    {
#define __  0
    23,__,__,__,31,__,__,39,19,__, 17,16,18,__,47,10,20, 9, 8,11,
     1, 0, 2,57,56,58, 3,12,__,59, __,__,21,__, 4,__,__,60,__,__,
    __,__,__,13,__,__,__,__,__,__,  5,__,__,61,__,__,__,__,__,__,
    __,__,__,__,22,__,__,__,30,__, __,38,__,__,__,14,__,__,46,__,
    __,__, 6,__,__,62,__,__,__,54, __,__,__,__,__,__,__,__,__,__,
    29,__,__,37,__,__,__,__,__,__, 45,__,__,__,__,__,28,__,__,36,
    __,53,__,__,27,__,44,35,26,24, 25,34,32,33,43,40,41,52,42,15,
    __,50,48,49,__,51, 7,__,__,63, __,__,__,55
#undef __
    };

//eof

P.S.  You can even avoid the table lookup if you're willing to deal with
scrambled square indecies in the range 0 to 153.

P.P.S.  I feel a little like the unlucky scientist whose results were lost
because they were only published _four_ times... :)



This page took 0.06 seconds to execute

Last modified: Thu, 15 Apr 21 08:11:13 -0700

Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.