Author: Dann Corbit
Date: 21:47:36 05/11/01
For machines with MMX, this is definitely faster than any other method no matter
how many bits are populated (1 to 32) on a 64 bit bitboard.
#include "amd3dx.h"
/* Athlon MMX optimized popcount function, based on AMD's Athlon
optimization manuals */
unsigned PopulationCount (unsigned __int64 v)
{
static const __int64 C55 = 0x5555555555555555;
static const __int64 C33 = 0x3333333333333333;
static const __int64 C0F = 0x0F0F0F0F0F0F0F0F;
__asm {
movd mm0, word ptr v;
punpckldq mm0, word ptr v + 4;
movq mm1,mm0;
psrld mm0,1;
pand mm0,[C55];
psubd mm1,mm0;
movq mm0,mm1;
psrld mm1,2;
pand mm0,[C33];
pand mm1,[C33];
paddd mm0,mm1;
movq mm1,mm0;
psrld mm0,4;
paddd mm0,mm1;
pand mm0,[C0F];
pxor mm1,mm1;
psadbw (mm0,mm1);
movd eax,mm0;
emms;
}
}
If you lack MMX, you might look at this as an alternative:
unsigned popcount32(unsigned int input)
{
__asm {
mov esi, input
mov eax,esi
shr eax,1
and eax,055555555h ; (n >> 1) & 0x55555555
sub esi,eax ; n - ((n >> 1) & 0x55555555)
mov eax,esi
shr eax,2 ; n >> 2
and esi,033333333h ; n & 0x33333333
and eax,033333333h ; (n >> 2) & 0x33333333
add esi,eax ; n = (n & 0x33333333) + ((n >> 2) &
0x33333333)
mov eax,esi
shr eax,4 ; n >> 4
add eax,esi ; n + (n >> 4)
and eax,00F0F0F0Fh ; n = (n + (n >> 4) & 0x0F0F0F0F)
mov esi,eax
shr esi,8 ; n >> 8
add eax,esi ; n = n + (n >> 8)
mov esi,eax
shr esi,16 ; n >> 16
add eax,esi ; n = n + (n >> 16)
and eax,00000003Fh ; popcount
}
}
typedef union {
unsigned __int64 i64;
unsigned i32[2];
} bb;
unsigned pc(bb b)
{
return (popcount32(b.i32[0]) + popcount32(b.i32[1]));
}
For my machine (AMD Athlon) it is definitely faster than the bsr/bsf routines
found in Crafty, Amy, etc.
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.