Author: Dann Corbit
Date: 12:18:12 05/14/01
Go up one level in this thread
On May 12, 2001 at 01:31:20, Robert Hyatt wrote:
[snip]
>We are comparing apples and oranges? BSR/BSF are not popcnt instructions in
>Crafty. They are to handle FirstOne()/LastOne() operations. PopCnt() is
>a different function altogether. I don't think your code will outrun mine
>for 0 1 or 2 bits set, which is the major case in the places I use it. But
>my PopCnt() doesn't have any bsf/bsr instructions anyway so maybe we are
>talking different things..
As GCP stated, I misspoke. Here is the code I actually benchmarked:
#include "amd3dx.h"
/* Athlon MMX optimized popcount function, based on AMD's Athlon
optimization manuals */
unsigned DensePopulationCount (unsigned __int64 v)
{
static const __int64 C55 = 0x5555555555555555;
static const __int64 C33 = 0x3333333333333333;
static const __int64 C0F = 0x0F0F0F0F0F0F0F0F;
__asm {
movd mm0, word ptr v;
punpckldq mm0, word ptr v + 4;
movq mm1,mm0;
psrld mm0,1;
pand mm0,[C55];
psubd mm1,mm0;
movq mm0,mm1;
psrld mm1,2;
pand mm0,[C33];
pand mm1,[C33];
paddd mm0,mm1;
movq mm1,mm0;
psrld mm0,4;
paddd mm0,mm1;
pand mm0,[C0F];
pxor mm1,mm1;
psadbw (mm0,mm1);
movd eax,mm0;
emms;
}
}
/* From Crafty, this is for sparsely populated bitmaps */
unsigned SparsePopulationCount (unsigned __int64 a) {
__asm {
mov ecx, dword ptr a
xor eax, eax
test ecx, ecx
jz l1
l0: lea edx, [ecx-1]
inc eax
and ecx, edx
jnz l0
l1: mov ecx, dword ptr a+4
test ecx, ecx
jz l3
l2: lea edx, [ecx-1]
inc eax
and ecx, edx
jnz l2
l3:
}
}
unsigned FirstBit(unsigned __int64 a) {
__asm {
bsr edx, dword ptr a+4
mov eax, 31
jnz l1
bsr edx, dword ptr a
mov eax, 63
jnz l1
mov edx, -1
l1: sub eax, edx
}
}
unsigned LastBit(unsigned __int64 a) {
__asm {
bsf edx, dword ptr a
mov eax, 63
jnz l1
bsf edx, dword ptr a+4
mov eax, 31
jnz l1
mov edx, -33
l1: sub eax, edx
}
}
/*
* Find number of nonzero bits in a unsigned __int64.
* Defined by Ernst A. Heinz, University of Karlsruhe.
*/
unsigned int CPopulationCount(unsigned __int64 b)
{
unsigned int n;
for (n = 0; b != 0; n++, b &= (b-1));
return n;
}
/*
This one sometimes gives the WRONG ANSWER!!!
Too bad. For some problem sizes, it is fast as
the burning blue blazes.
#define mm1 ((unsigned __int64) 0x5555555555555555)
#define mm2 ((unsigned __int64) 0x3333333333333333)
unsigned int NCPopulationCount(unsigned __int64 b)
{
unsigned int n;
const unsigned __int64 a = b - ((b >> 1) & mm1);
const unsigned __int64 c = (a & mm2) + ((a >> 2) & mm2);
n = ((unsigned int)c) + ((unsigned int)(c >> 32));
n = (n & 0x0F0F0F0F) + ((n >> 4) & 0x0F0F0F);
n = (n & 0xFFFF) + (n >> 16);
n = (n & 0xFF) + (n >> 8);
return n;
}
*/
/*
; population count of a 32 bit register
;
; input:
; esi = number to population count
;
; output:
; eax = population count
;
; destroys:
; esi, eax
; eflags
*/
unsigned popcount32fm(unsigned int input)
{
__asm {
mov esi, input
mov eax,esi
shr eax,1
and eax,055555555h ; (n >> 1) & 0x55555555
sub esi,eax ; n - ((n >> 1) & 0x55555555)
mov eax,esi
shr eax,2 ; n >> 2
and esi,033333333h ; n & 0x33333333
and eax,033333333h ; (n >> 2) & 0x33333333
add esi,eax ; n = (n & 0x33333333) + ((n >> 2) &
0x33333333)
mov eax,esi
shr eax,4 ; n >> 4
add eax,esi ; n + (n >> 4)
and eax,00F0F0F0Fh ; n = (n + (n >> 4) & 0x0F0F0F0F)
imul eax,001010101h ; add by multiplying with a "magic
number"
shr eax,24 ; shift result into place
}
}
unsigned popcount32(unsigned int input)
{
__asm {
mov esi, input
mov eax,esi
shr eax,1
and eax,055555555h ; (n >> 1) & 0x55555555
sub esi,eax ; n - ((n >> 1) & 0x55555555)
mov eax,esi
shr eax,2 ; n >> 2
and esi,033333333h ; n & 0x33333333
and eax,033333333h ; (n >> 2) & 0x33333333
add esi,eax ; n = (n & 0x33333333) + ((n >> 2) &
0x33333333)
mov eax,esi
shr eax,4 ; n >> 4
add eax,esi ; n + (n >> 4)
and eax,00F0F0F0Fh ; n = (n + (n >> 4) & 0x0F0F0F0F)
mov esi,eax
shr esi,8 ; n >> 8
add eax,esi ; n = n + (n >> 8)
mov esi,eax
shr esi,16 ; n >> 16
add eax,esi ; n = n + (n >> 16)
and eax,00000003Fh ; popcount
}
}
typedef union {
unsigned __int64 i64;
unsigned i32[2];
} bb;
unsigned pc(bb b)
{
return (popcount32fm(b.i32[0]) + popcount32fm(b.i32[1]));
}
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <limits.h>
int rand_range(int N)
{
return (int) ((double) rand() / ((double) RAND_MAX + 1) * N);
}
char string[32767];
#define BITMASK(b) (1 << ((b) % CHAR_BIT))
#define BITSLOT(b) ((b) / CHAR_BIT)
#define BITSET(a, b) ((a)[BITSLOT(b)] |= BITMASK(b))
int main(void)
{
int i, j;
int mbc;
int ntr;
int c1,
c2,
c3,
c4,
c5,
c6,
c7;
bb a;
unsigned char *bar = (unsigned char *) &a.i64;
puts("Please enter the maximum number of bits:");
if (fgets(string, sizeof string, stdin))
mbc = atoi(string);
mbc = max(mbc, 1);
mbc = min(mbc, 32);
puts("Please enter the maximum number of trials:");
if (fgets(string, sizeof string, stdin))
ntr = atoi(string);
for (i = 0; i < ntr; i++) {
a.i64=0;
for (j = 0; j < mbc; j++)
BITSET(bar, rand_range(63));
c1 = DensePopulationCount(a.i64);
c2 = SparsePopulationCount(a.i64);
c3 = CPopulationCount(a.i64);
// c4 = NCPopulationCount(a.i64);
c5 = popcount32fm(a.i32[0]) + popcount32fm(a.i32[1]);
c6 = popcount32(a.i32[0]) + popcount32(a.i32[1]);
// c7 = pc(a);
if (c1 != c2)
printf("c1=%u, c2 = %u\n", c1, c2);
if (c1 != c3)
printf("c1=%u, c3 = %u\n", c1, c3);
// if (c1 != c4)
// printf("c1=%u, c4 = %u\n", c1, c4);
if (c1 != c5)
printf("c1=%u, c5 = %u\n", c1, c5);
if (c1 != c6)
printf("c1=%u, c6 = %u\n", c1, c6);
// if (c1 != c7)
// printf("c1=%u, c7 = %u\n", c1, c7);
}
return 0;
}
And here is the table of the results that I calculated:
BITS CPopulationCount DensePopulationCount Popcount32 Popcount32fm SparsePopulationCount
32 3.08 0.81 1.03 0.99 1.63
31
30
29
28
27
26 3.3 1.01 1.19 1.2 1.98
25
24
23 3.45 2.02 2.62 2.77 2.37
22
21
20
19
18 3.4 1.36 1.74 1.71 2.07
17
16
15
14
13 3.41 1.22 1.49 1.53 1.98
12
11 3.46 1.12 1.3 1.4 2.06
10
9
8 3.33 0.93 1.22 1.24 1.95
7 3.22 0.91 1.11 1.19 1.71
6 3.16 1.12 1.3 1.4 2.06
5 3.04 0.88 1.09 1.08 1.8
4 3.2 0.87 0.99 1.05 1.73
3 2.99 0.79 1.02 1.06 1.77
2 3.07 0.85 1.03 1.01 1.76
The gaps represent places were no measurements were taken. I used the data to
draw graphs using Microsoft Excell, which is incredibly dense as to what x-axis
data might possibly mean (everything is a text label).
As you can see, DensePopulationCount wins on *my* machine for all problem sizes
up to the largest size that will occur on a real chess board. I suspected it
would be fastest for dense bitsets, as you can see by my ill-chosen name). At
any rate, this varies quite a bit from machine to machine, with some (clearly)
not even being able to run it -- even amongst intel-type architectures.
So, as always, YMMV.
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.