Author: Gerd Isenberg
Date: 12:05:56 12/16/02
Hi all,
My conclusion with Athlons bit scan (and reset) routines so far:
Inlined bsf is the fastest for IsiChess, because of shortest code!
Walter Faxon's magic LSB_64 and bitSearchAndReset_MMX (both faster not inlined)
both produce a about 3-5% slower IsiChess. LSB_64 and bitSearchAndReset_MMX are
equal in IsiChess. In a dumb-loop test bitSearchAndReset_MMX is clearly the
fastest (14.9 seconds) for 10^8 * 10 bits resets.
An Athlon-MMX/3DNow! routine, to traverse bitboards in mm0 like this is quite
nice. But inefficient with rarely populated or even more with zero bitboards:
while ( (sq = bitSearchAndReset_MM0()) < 64)
doSomethingWithSquareWithoutUsingMM0(sq);
The bitboard is not null test is too cheap with 32 (64) bit general purpose
registers, that i believe now, traversing or serializing botboards is a domain
of general purpose registers and sucks with mmx or sse(2) due to the lack of a
cheap is (not) zero jump.
Also any try to scan bits simultaniously sucks due to rarely populated bitboards
during capture generation in sevaral state-runs.
But anyway, the mmx-routines for athlons:
Cheers,
Gerd
// if bb == 0 the routine returns some value > 63
// in/out: mm0 (bb) with reset found bit
// changed: mm0..mm4
unsigned int bitSearchAndReset_MM0()
{
__asm
{
pxor mm1, mm1 ; 0
pcmpeqd mm2, mm2 ; -1
pcmpeqd mm3, mm3 ; -1
pcmpeqd mm1, mm0 ; -(hibb == 0) : -(lobb == 0)
movq mm4, mm0 ; bb
punpckldq mm3, mm1 ; -(lobb == 0) : -1
pxor mm2, mm1 ; -(hibb != 0) : -(lobb != 0)
punpckldq mm2, mm1 ; -(lobb == 0) : -(lobb != 0)
paddd mm0, mm3 ; +(-1)
pand mm0, mm4 ; bb & (bb-1) lsb is reset
psllq mm1, 63 ; ..*(lobb == 0)
pxor mm4, mm0 ; lsb
psrld mm2, 25 ; (bb == 0) ? 7f:00 ? 00:7f
PI2FD mm4, mm4 ; 3f8..,400..
psrlq mm1, 63-5 ; 32*(lobb == 0)
pslld mm4, 1 ; clear sign
psrld mm4, 24 ; 3f8 to 7f
psubd mm4, mm2 ; - (bb == 0) ? 7f:00 ? 00:7f
PSADBW mm4, mm1 ; sum of absolute byte differences
movd eax, mm4
}
}
unsigned int bitSearchAndReset_MMX(BitBoard &bb)
{
__asm
{
mov eax, [bb] ; get the reference (like a pointer)
movq mm0, [eax] ; bb should be 8-byte aligned
pxor mm1, mm1 ; 0
pcmpeqd mm2, mm2 ; -1
pcmpeqd mm3, mm3 ; -1
pcmpeqd mm1, mm0 ; -(hibb == 0) : -(lobb == 0)
movq mm4, mm0 ; bb
punpckldq mm3, mm1 ; -(lobb == 0) : -1
pxor mm2, mm1 ; -(hibb != 0) : -(lobb != 0)
punpckldq mm2, mm1 ; -(lobb == 0) : -(lobb != 0)
paddd mm0, mm3 ; +(-1)
pand mm0, mm4 ; bb & (bb-1) lsb is reset
psllq mm1, 63 ; ..*(lobb == 0)
pxor mm4, mm0 ; lsb
movq [eax], mm0 ; store with reset lsb
psrld mm2, 25 ; (bb == 0) ? 7f:00 ? 00:7f
PI2FD mm4, mm4 ; 3f8..,400..
psrlq mm1, 63-5 ; 32*(lobb == 0)
pslld mm4, 1 ; clear sign
psrld mm4, 24 ; 3f8 to 7f
psubd mm4, mm2 ; - (bb == 0) ? 7f:00 ? 00:7f
PSADBW mm4, mm1 ; sum of absolute byte differences
movd eax, mm4
}
}
The fastest in IsiChess:
__forceinline
unsigned int bitSearchAndReset(BitBoard &bb) // precondition bb != 0
{
__asm
{
xor edx, edx
mov ebx, [bb]
xor eax, eax
inc edx
bsf ecx, [ebx]
jnz found
bsf ecx, [ebx+4]
lea ebx, [ebx+4]
xor eax, 32
found:
shl edx, cl
xor eax, ecx
xor [ebx], edx
}
}
This page took 0.01 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.