Computer Chess Club Archives

Search

Terms
Messages

Subject: Re: PopCnt and Leadz

Author: Landon Rabern
Date: 19:57:15 12/03/00
Here is what I use for MSB and LSB in Betsy.  I just use a simple iterative
algorithm in C for popCount.  H1=0 A8=63, thats exactly opposite of what you
wanted, but I thought it might help.  I get about a 25% speedup using these over
my old lookup tables.

__forceinline int MSB(bitBoard n){
	__asm {
        bsr     edx, dword ptr n+4
        mov     eax, 32
        jnz     l1
        bsr     edx, dword ptr n
        mov     eax, 0
        jnz     l1
        mov     edx, -1
   q:   add     eax, edx
  }
}

__forceinline int LSB(bitBoard n){
	__asm {
	bsf     edx, dword ptr n
        mov     eax, 0
        jnz     l1
        bsf     edx, dword ptr n+4
        mov     eax, 32
        jnz     l1
        mov     edx, -33
   q:   add     eax, edx
  }
}
On December 03, 2000 at 08:53:34, David Rasmussen wrote:

>On December 02, 2000 at 11:09:03, Vincent Diepeveen wrote:
>
>>On December 02, 2000 at 07:23:16, David Rasmussen wrote:
>>
>>>Hi there.
>>>
>>>Someone, please post inline assembler functions for me to just cut and paste
>>>into my existing files for my chess program, for these functions, in VC++6.
>>>
>>>I don't bother to write them myself, I don't even bother to take the crafty
>>>stuff and put that in, in case I have to change anything. I'm just curious about
>>>the speed gain.
>>>
>>>P.S. My bitboard implementation has first bit (0) at A8 and last bit (63) at H1.
>>>
>>>Thanks in advance.
>>
>>Sorry to dissappoint you but PopCnt and Leadz are pretty slow
>>in their current crafty implementations, basically because
>>bitboards are for 64 bits machines and intel is 32 bits for a long
>>period of time still.
>>
>> vcinline.h:
>>
>>extern unsigned char  first_ones[65536];
>>extern unsigned char  last_ones[65536];
>>
>>#if _MSC_VER >= 1200
>>#define FORCEINLINE __forceinline
>>#else
>>#define FORCEINLINE __inline
>>#endif
>>
>>
>>FORCEINLINE int PopCnt(BITBOARD a) {
>>
>>/* Because Crafty bitboards are typically sparsely populated, we use a
>>   streamlined version of the boolean.c algorithm instead of the one in x86.s */
>>
>>  __asm {
>>        mov     ecx, dword ptr a
>>        xor     eax, eax
>>        test    ecx, ecx
>>        jz      l1
>>    l0: lea     edx, [ecx-1]
>>        inc     eax
>>        and     ecx, edx
>>        jnz     l0
>>    l1: mov     ecx, dword ptr a+4
>>        test    ecx, ecx
>>        jz      l3
>>    l2: lea     edx, [ecx-1]
>>        inc     eax
>>        and     ecx, edx
>>        jnz     l2
>>    l3:
>>  }
>>}
>>
>>
>>FORCEINLINE int FirstOne(BITBOARD a) {
>>
>>#if _M_IX86 <= 500 /* on plain Pentiums, use boolean.c algorithm */
>>  __asm {
>>        movzx   edx, word ptr a+6
>>        xor     eax, eax
>>        test    edx, edx
>>        jnz     l1
>>        mov     dx, word ptr a+4
>>        mov     eax, 16
>>        test    edx, edx
>>        jnz     l1
>>        mov     dx, word ptr a+2
>>        mov     eax, 32
>>        test    edx, edx
>>        jnz     l1
>>        mov     dx, word ptr a
>>        mov     eax, 48
>>  l1:   add     al, byte ptr first_ones[edx]
>>  }
>>#else /* BSF and BSR are *fast* instructions on PPro/PII */
>>  __asm {
>>        bsr     edx, dword ptr a+4
>>        mov     eax, 31
>>        jnz     l1
>>        bsr     edx, dword ptr a
>>        mov     eax, 63
>>        jnz     l1
>>        mov     edx, -1
>>  l1:   sub     eax, edx
>>  }
>>#endif /* _M_IX86 > 500 */
>>}
>>
>>FORCEINLINE int LastOne(BITBOARD a) {
>>
>>#if _M_IX86 <= 500 /* on plain Pentiums, use boolean.c algorithm */
>>  __asm {
>>        movzx   edx, word ptr a
>>        mov     eax, 48
>>        test    edx, edx
>>        jnz     l1
>>        mov     dx, word ptr a+2
>>        mov     eax, 32
>>        test    edx, edx
>>        jnz     l1
>>        mov     dx, word ptr a+4
>>        mov     eax, 16
>>        test    edx, edx
>>        jnz     l1
>>        mov     dx, word ptr a+6
>>        xor     eax, eax
>>        test    edx, edx
>>        jnz     l1
>>        mov     eax, 48
>>l1:     add     al, byte ptr last_ones[edx]
>>  }
>>#else /* BSF and BSR are *fast* instructions on PPro/PII */
>>  __asm {
>>        bsf     edx, dword ptr a
>>        mov     eax, 63
>>        jnz     l1
>>        bsf     edx, dword ptr a+4
>>        mov     eax, 31
>>        jnz     l1
>>        mov     edx, -33
>>  l1:   sub     eax, edx
>>  }
>>#endif /* _M_IX386 > 500 */
>>}
>
>Not to be unthankful, but I believe that that implementation assumes A1=0, H8=63
This page took 0.01 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.