Computer Chess Club Archives

Search

Terms
Messages

Subject: Re: The fastest popcount

Author: Marcus Heidkamp
Date: 23:42:28 05/13/01
On May 12, 2001 at 00:47:36, Dann Corbit wrote:

>For machines with MMX, this is definitely faster than any other method no matter
>how many bits are populated (1 to 32) on a 64 bit bitboard.
>
>#include "amd3dx.h"
>
>/* Athlon MMX optimized popcount function, based on AMD's Athlon
>   optimization manuals */
>
>unsigned PopulationCount (unsigned __int64 v)
>{
> static const __int64 C55 = 0x5555555555555555;
> static const __int64 C33 = 0x3333333333333333;
> static const __int64 C0F = 0x0F0F0F0F0F0F0F0F;
>
> __asm {
>  movd mm0, word ptr v;
>  punpckldq mm0, word ptr v +  4;
>  movq mm1,mm0;
>  psrld mm0,1;
>  pand mm0,[C55];
>  psubd mm1,mm0;
>  movq mm0,mm1;
>  psrld mm1,2;
>  pand mm0,[C33];
>  pand mm1,[C33];
>  paddd mm0,mm1;
>  movq mm1,mm0;
>  psrld mm0,4;
>  paddd mm0,mm1;
>  pand mm0,[C0F];
>  pxor mm1,mm1;
>  psadbw (mm0,mm1);
>  movd eax,mm0;
>  emms;
> }
>}
>
>
>If you lack MMX, you might look at this as an alternative:
>unsigned popcount32(unsigned int input)
>{
>__asm {
>        mov     esi, input
>        mov     eax,esi
>        shr     eax,1
>        and     eax,055555555h           ;  (n >> 1) & 0x55555555
>        sub     esi,eax                  ;  n - ((n >> 1) & 0x55555555)
>        mov     eax,esi
>        shr     eax,2                    ;  n >> 2
>        and     esi,033333333h           ;  n & 0x33333333
>        and     eax,033333333h           ;  (n >> 2)  & 0x33333333
>        add     esi,eax                  ;  n = (n & 0x33333333) + ((n >> 2) &
>0x33333333)
>        mov     eax,esi
>        shr     eax,4                    ;  n >> 4
>        add     eax,esi                  ;  n + (n >> 4)
>        and     eax,00F0F0F0Fh           ;  n = (n + (n >> 4) & 0x0F0F0F0F)
>        mov     esi,eax
>        shr     esi,8                    ;  n >> 8
>        add     eax,esi                  ;  n = n + (n >> 8)
>        mov     esi,eax
>        shr     esi,16                   ;  n >> 16
>        add     eax,esi                  ;  n = n + (n >> 16)
>        and     eax,00000003Fh           ;  popcount
>      }
>}
>
>typedef union {
>    unsigned __int64 i64;
>    unsigned        i32[2];
>}               bb;
>
>unsigned        pc(bb b)
>{
>    return (popcount32(b.i32[0]) + popcount32(b.i32[1]));
>}
>
>For my machine (AMD Athlon) it is definitely faster than the bsr/bsf routines
>found in Crafty, Amy, etc.

Just one remark about your "MMX" code:
If you would omit the last emms statement you can save additional penalties, if
MMX instructions are used again, e.g. in a loop. Only try to use emms if the
code collides with floating point arithmetic. I found, that this _never_ happens
in my chess program. So I only call it, just to be safe, after move generation
and after I finished calculating attacks, etc.

Marcus
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.