Author: Marcus Heidkamp
Date: 23:42:28 05/13/01
Go up one level in this thread
On May 12, 2001 at 00:47:36, Dann Corbit wrote:
>For machines with MMX, this is definitely faster than any other method no matter
>how many bits are populated (1 to 32) on a 64 bit bitboard.
>
>#include "amd3dx.h"
>
>/* Athlon MMX optimized popcount function, based on AMD's Athlon
> optimization manuals */
>
>unsigned PopulationCount (unsigned __int64 v)
>{
> static const __int64 C55 = 0x5555555555555555;
> static const __int64 C33 = 0x3333333333333333;
> static const __int64 C0F = 0x0F0F0F0F0F0F0F0F;
>
> __asm {
> movd mm0, word ptr v;
> punpckldq mm0, word ptr v + 4;
> movq mm1,mm0;
> psrld mm0,1;
> pand mm0,[C55];
> psubd mm1,mm0;
> movq mm0,mm1;
> psrld mm1,2;
> pand mm0,[C33];
> pand mm1,[C33];
> paddd mm0,mm1;
> movq mm1,mm0;
> psrld mm0,4;
> paddd mm0,mm1;
> pand mm0,[C0F];
> pxor mm1,mm1;
> psadbw (mm0,mm1);
> movd eax,mm0;
> emms;
> }
>}
>
>
>If you lack MMX, you might look at this as an alternative:
>unsigned popcount32(unsigned int input)
>{
>__asm {
> mov esi, input
> mov eax,esi
> shr eax,1
> and eax,055555555h ; (n >> 1) & 0x55555555
> sub esi,eax ; n - ((n >> 1) & 0x55555555)
> mov eax,esi
> shr eax,2 ; n >> 2
> and esi,033333333h ; n & 0x33333333
> and eax,033333333h ; (n >> 2) & 0x33333333
> add esi,eax ; n = (n & 0x33333333) + ((n >> 2) &
>0x33333333)
> mov eax,esi
> shr eax,4 ; n >> 4
> add eax,esi ; n + (n >> 4)
> and eax,00F0F0F0Fh ; n = (n + (n >> 4) & 0x0F0F0F0F)
> mov esi,eax
> shr esi,8 ; n >> 8
> add eax,esi ; n = n + (n >> 8)
> mov esi,eax
> shr esi,16 ; n >> 16
> add eax,esi ; n = n + (n >> 16)
> and eax,00000003Fh ; popcount
> }
>}
>
>typedef union {
> unsigned __int64 i64;
> unsigned i32[2];
>} bb;
>
>unsigned pc(bb b)
>{
> return (popcount32(b.i32[0]) + popcount32(b.i32[1]));
>}
>
>For my machine (AMD Athlon) it is definitely faster than the bsr/bsf routines
>found in Crafty, Amy, etc.
Just one remark about your "MMX" code:
If you would omit the last emms statement you can save additional penalties, if
MMX instructions are used again, e.g. in a loop. Only try to use emms if the
code collides with floating point arithmetic. I found, that this _never_ happens
in my chess program. So I only call it, just to be safe, after move generation
and after I finished calculating attacks, etc.
Marcus
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.