Computer Chess Club Archives


Search

Terms

Messages

Subject: The fastest popcount

Author: Dann Corbit

Date: 21:47:36 05/11/01


For machines with MMX, this is definitely faster than any other method no matter
how many bits are populated (1 to 32) on a 64 bit bitboard.

#include "amd3dx.h"

/* Athlon MMX optimized popcount function, based on AMD's Athlon
   optimization manuals */

unsigned PopulationCount (unsigned __int64 v)
{
 static const __int64 C55 = 0x5555555555555555;
 static const __int64 C33 = 0x3333333333333333;
 static const __int64 C0F = 0x0F0F0F0F0F0F0F0F;

 __asm {
  movd mm0, word ptr v;
  punpckldq mm0, word ptr v +  4;
  movq mm1,mm0;
  psrld mm0,1;
  pand mm0,[C55];
  psubd mm1,mm0;
  movq mm0,mm1;
  psrld mm1,2;
  pand mm0,[C33];
  pand mm1,[C33];
  paddd mm0,mm1;
  movq mm1,mm0;
  psrld mm0,4;
  paddd mm0,mm1;
  pand mm0,[C0F];
  pxor mm1,mm1;
  psadbw (mm0,mm1);
  movd eax,mm0;
  emms;
 }
}


If you lack MMX, you might look at this as an alternative:
unsigned popcount32(unsigned int input)
{
__asm {
        mov     esi, input
        mov     eax,esi
        shr     eax,1
        and     eax,055555555h           ;  (n >> 1) & 0x55555555
        sub     esi,eax                  ;  n - ((n >> 1) & 0x55555555)
        mov     eax,esi
        shr     eax,2                    ;  n >> 2
        and     esi,033333333h           ;  n & 0x33333333
        and     eax,033333333h           ;  (n >> 2)  & 0x33333333
        add     esi,eax                  ;  n = (n & 0x33333333) + ((n >> 2) &
0x33333333)
        mov     eax,esi
        shr     eax,4                    ;  n >> 4
        add     eax,esi                  ;  n + (n >> 4)
        and     eax,00F0F0F0Fh           ;  n = (n + (n >> 4) & 0x0F0F0F0F)
        mov     esi,eax
        shr     esi,8                    ;  n >> 8
        add     eax,esi                  ;  n = n + (n >> 8)
        mov     esi,eax
        shr     esi,16                   ;  n >> 16
        add     eax,esi                  ;  n = n + (n >> 16)
        and     eax,00000003Fh           ;  popcount
      }
}

typedef union {
    unsigned __int64 i64;
    unsigned        i32[2];
}               bb;

unsigned        pc(bb b)
{
    return (popcount32(b.i32[0]) + popcount32(b.i32[1]));
}

For my machine (AMD Athlon) it is definitely faster than the bsr/bsf routines
found in Crafty, Amy, etc.




This page took 0 seconds to execute

Last modified: Thu, 15 Apr 21 08:11:13 -0700

Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.