Computer Chess Club Archives


Search

Terms

Messages

Subject: Re: The fastest popcount

Author: Dann Corbit

Date: 12:18:12 05/14/01

Go up one level in this thread


On May 12, 2001 at 01:31:20, Robert Hyatt wrote:
[snip]
>We are comparing apples and oranges?  BSR/BSF are not popcnt instructions in
>Crafty.  They are to handle FirstOne()/LastOne() operations.  PopCnt() is
>a different function altogether.  I don't think your code will outrun mine
>for 0 1 or 2 bits set, which is the major case in the places I use it.  But
>my PopCnt() doesn't have any bsf/bsr instructions anyway so maybe we are
>talking different things..

As GCP stated, I misspoke.  Here is the code I actually benchmarked:


#include "amd3dx.h"

/* Athlon MMX optimized popcount function, based on AMD's Athlon
   optimization manuals */

unsigned DensePopulationCount (unsigned __int64 v)
{
 static const __int64 C55 = 0x5555555555555555;
 static const __int64 C33 = 0x3333333333333333;
 static const __int64 C0F = 0x0F0F0F0F0F0F0F0F;

 __asm {
  movd mm0, word ptr v;
  punpckldq mm0, word ptr v +  4;
  movq mm1,mm0;
  psrld mm0,1;
  pand mm0,[C55];
  psubd mm1,mm0;
  movq mm0,mm1;
  psrld mm1,2;
  pand mm0,[C33];
  pand mm1,[C33];
  paddd mm0,mm1;
  movq mm1,mm0;
  psrld mm0,4;
  paddd mm0,mm1;
  pand mm0,[C0F];
  pxor mm1,mm1;
  psadbw (mm0,mm1);
  movd eax,mm0;
  emms;
 }
}


/* From Crafty, this is for sparsely populated bitmaps */
unsigned SparsePopulationCount (unsigned __int64 a) {
  __asm {
        mov     ecx, dword ptr a
        xor     eax, eax
        test    ecx, ecx
        jz      l1
    l0: lea     edx, [ecx-1]
        inc     eax
        and     ecx, edx
        jnz     l0
    l1: mov     ecx, dword ptr a+4
        test    ecx, ecx
        jz      l3
    l2: lea     edx, [ecx-1]
        inc     eax
        and     ecx, edx
        jnz     l2
    l3:
  }
}

unsigned FirstBit(unsigned __int64 a) {
  __asm {
        bsr     edx, dword ptr a+4
        mov     eax, 31
        jnz     l1
        bsr     edx, dword ptr a
        mov     eax, 63
        jnz     l1
        mov     edx, -1
  l1:   sub     eax, edx
  }
}

unsigned LastBit(unsigned __int64 a) {
  __asm {
        bsf     edx, dword ptr a
        mov     eax, 63
        jnz     l1
        bsf     edx, dword ptr a+4
        mov     eax, 31
        jnz     l1
        mov     edx, -33
  l1:   sub     eax, edx
  }
}


/*
 * Find number of nonzero bits in a unsigned __int64.
 * Defined by Ernst A. Heinz, University of Karlsruhe.
 */
unsigned int CPopulationCount(unsigned __int64 b)
  {
    unsigned int n;

    for (n = 0; b != 0; n++, b &= (b-1));
        return n;
  }

/*
This one sometimes gives the WRONG ANSWER!!!
Too bad.  For some problem sizes, it is fast as
the burning blue blazes.
#define mm1 ((unsigned __int64) 0x5555555555555555)
#define mm2 ((unsigned __int64) 0x3333333333333333)

unsigned int NCPopulationCount(unsigned __int64 b)
{
    unsigned int n;
    const unsigned __int64 a = b - ((b >> 1) & mm1);
    const unsigned __int64 c = (a & mm2) + ((a >> 2) & mm2);
    n = ((unsigned int)c) + ((unsigned int)(c >> 32));
    n = (n & 0x0F0F0F0F) + ((n >> 4) & 0x0F0F0F);
    n = (n & 0xFFFF) + (n >> 16);
    n = (n & 0xFF) + (n >> 8);
    return n;
}

*/
/*
;  population count of a 32 bit register
;
;  input:
;    esi = number to population count
;
;  output:
;    eax = population count
;
;  destroys:
;   esi, eax
;   eflags
*/
unsigned popcount32fm(unsigned int input)
{
__asm {
        mov     esi, input
        mov     eax,esi
        shr     eax,1
        and     eax,055555555h           ;  (n >> 1) & 0x55555555
        sub     esi,eax                  ;  n - ((n >> 1) & 0x55555555)
        mov     eax,esi
        shr     eax,2                    ;  n >> 2
        and     esi,033333333h           ;  n & 0x33333333
        and     eax,033333333h           ;  (n >> 2)  & 0x33333333
        add     esi,eax                  ;  n = (n & 0x33333333) + ((n >> 2) &
0x33333333)
        mov     eax,esi
        shr     eax,4                    ;  n >> 4
        add     eax,esi                  ;  n + (n >> 4)
        and     eax,00F0F0F0Fh           ;  n = (n + (n >> 4) & 0x0F0F0F0F)
        imul    eax,001010101h           ;  add by multiplying with a "magic
number"
        shr     eax,24                   ;  shift result into place
      }
}

unsigned popcount32(unsigned int input)
{
__asm {
        mov     esi, input
        mov     eax,esi
        shr     eax,1
        and     eax,055555555h           ;  (n >> 1) & 0x55555555
        sub     esi,eax                  ;  n - ((n >> 1) & 0x55555555)
        mov     eax,esi
        shr     eax,2                    ;  n >> 2
        and     esi,033333333h           ;  n & 0x33333333
        and     eax,033333333h           ;  (n >> 2)  & 0x33333333
        add     esi,eax                  ;  n = (n & 0x33333333) + ((n >> 2) &
0x33333333)
        mov     eax,esi
        shr     eax,4                    ;  n >> 4
        add     eax,esi                  ;  n + (n >> 4)
        and     eax,00F0F0F0Fh           ;  n = (n + (n >> 4) & 0x0F0F0F0F)
        mov     esi,eax
        shr     esi,8                    ;  n >> 8
        add     eax,esi                  ;  n = n + (n >> 8)
        mov     esi,eax
        shr     esi,16                   ;  n >> 16
        add     eax,esi                  ;  n = n + (n >> 16)
        and     eax,00000003Fh           ;  popcount
      }
}

typedef union {
    unsigned __int64 i64;
    unsigned        i32[2];
}               bb;

unsigned        pc(bb b)
{
    return (popcount32fm(b.i32[0]) + popcount32fm(b.i32[1]));
}

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <limits.h>

int             rand_range(int N)
{
    return (int) ((double) rand() / ((double) RAND_MAX + 1) * N);
}

char            string[32767];

#define BITMASK(b) (1 << ((b) % CHAR_BIT))
#define BITSLOT(b) ((b) / CHAR_BIT)
#define BITSET(a, b) ((a)[BITSLOT(b)] |= BITMASK(b))

int             main(void)
{
    int             i, j;
    int             mbc;
    int             ntr;
    int             c1,
                    c2,
                    c3,
                    c4,
                    c5,
                    c6,
                    c7;
    bb              a;
    unsigned char  *bar = (unsigned char *) &a.i64;

    puts("Please enter the maximum number of bits:");
    if (fgets(string, sizeof string, stdin))
        mbc = atoi(string);
    mbc = max(mbc, 1);
    mbc = min(mbc, 32);
    puts("Please enter the maximum number of trials:");

    if (fgets(string, sizeof string, stdin))
        ntr = atoi(string);
    for (i = 0; i < ntr; i++) {
        a.i64=0;
        for (j = 0; j < mbc; j++)
            BITSET(bar, rand_range(63));


        c1 = DensePopulationCount(a.i64);
        c2 = SparsePopulationCount(a.i64);
        c3 = CPopulationCount(a.i64);
//        c4 = NCPopulationCount(a.i64);
        c5 = popcount32fm(a.i32[0]) + popcount32fm(a.i32[1]);
        c6 = popcount32(a.i32[0]) + popcount32(a.i32[1]);
//        c7 = pc(a);
        if (c1 != c2)
            printf("c1=%u, c2 = %u\n", c1, c2);
        if (c1 != c3)
            printf("c1=%u, c3 = %u\n", c1, c3);
//        if (c1 != c4)
//            printf("c1=%u, c4 = %u\n", c1, c4);
        if (c1 != c5)
            printf("c1=%u, c5 = %u\n", c1, c5);
        if (c1 != c6)
            printf("c1=%u, c6 = %u\n", c1, c6);
//        if (c1 != c7)
//            printf("c1=%u, c7 = %u\n", c1, c7);

    }


    return 0;
}

And here is the table of the results that I calculated:
BITS	CPopulationCount	DensePopulationCount	Popcount32	Popcount32fm	SparsePopulationCount
32	3.08	0.81	1.03	0.99	1.63
31
30
29
28
27
26	3.3	1.01	1.19	1.2	1.98
25
24
23	3.45	2.02	2.62	2.77	2.37
22
21
20
19
18	3.4	1.36	1.74	1.71	2.07
17
16
15
14
13	3.41	1.22	1.49	1.53	1.98
12
11	3.46	1.12	1.3	1.4	2.06
10
9
8	3.33	0.93	1.22	1.24	1.95
7	3.22	0.91	1.11	1.19	1.71
6	3.16	1.12	1.3	1.4	2.06
5	3.04	0.88	1.09	1.08	1.8
4	3.2	0.87	0.99	1.05	1.73
3	2.99	0.79	1.02	1.06	1.77
2	3.07	0.85	1.03	1.01	1.76

The gaps represent places were no measurements were taken.  I used the data to
draw graphs using Microsoft Excell, which is incredibly dense as to what x-axis
data might possibly mean (everything is a text label).

As you can see, DensePopulationCount wins on *my* machine for all problem sizes
up to the largest size that will occur on a real chess board.  I suspected it
would be fastest for dense bitsets, as you can see by my ill-chosen name).  At
any rate, this varies quite a bit from machine to machine, with some (clearly)
not even being able to run it -- even amongst intel-type architectures.

So, as always, YMMV.




This page took 0 seconds to execute

Last modified: Thu, 15 Apr 21 08:11:13 -0700

Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.