Author: Dann Corbit
Date: 02:03:59 06/15/01
Go up one level in this thread
This is the test driver I used to bench bit operations:
#include "amd3dx.h"
/* Athlon MMX optimized popcount function, based on AMD's Athlon
optimization manuals */
unsigned DensePopulationCount (unsigned __int64 v)
{
static const __int64 C55 = 0x5555555555555555;
static const __int64 C33 = 0x3333333333333333;
static const __int64 C0F = 0x0F0F0F0F0F0F0F0F;
__asm {
movd mm0, word ptr v;
punpckldq mm0, word ptr v + 4;
movq mm1,mm0;
psrld mm0,1;
pand mm0,[C55];
psubd mm1,mm0;
movq mm0,mm1;
psrld mm1,2;
pand mm0,[C33];
pand mm1,[C33];
paddd mm0,mm1;
movq mm1,mm0;
psrld mm0,4;
paddd mm0,mm1;
pand mm0,[C0F];
pxor mm1,mm1;
psadbw (mm0,mm1);
movd eax,mm0;
emms;
}
}
/* From Crafty, this is for sparsely populated bitmaps */
unsigned SparsePopulationCount (unsigned __int64 a) {
__asm {
mov ecx, dword ptr a
xor eax, eax
test ecx, ecx
jz l1
l0: lea edx, [ecx-1]
inc eax
and ecx, edx
jnz l0
l1: mov ecx, dword ptr a+4
test ecx, ecx
jz l3
l2: lea edx, [ecx-1]
inc eax
and ecx, edx
jnz l2
l3:
}
}
unsigned FirstBit(unsigned __int64 a) {
__asm {
bsr edx, dword ptr a+4
mov eax, 31
jnz l1
bsr edx, dword ptr a
mov eax, 63
jnz l1
mov edx, -1
l1: sub eax, edx
}
}
unsigned LastBit(unsigned __int64 a) {
__asm {
bsf edx, dword ptr a
mov eax, 63
jnz l1
bsf edx, dword ptr a+4
mov eax, 31
jnz l1
mov edx, -33
l1: sub eax, edx
}
}
unsigned int first_piece[65536];
#define TwoFullRanks (65535)
static unsigned __int64 FPMask1;
static unsigned __int64 FPMask2;
int ifp()
{
unsigned i, j;
FPMask1 = ((unsigned __int64 ) TwoFullRanks << 16);
FPMask2 = ((unsigned __int64 ) TwoFullRanks << 32);
/* Lookup tables for the first- & last- piece functions */
for (i = 0; i < 65536; i++) {
/* First piece in a two rank chunk */
for (j = 0; j < 16; j++) {
if (i & (1 << j)) {
first_piece[i] = j;
break;
}
}
}
}
unsigned FirstPiece(unsigned __int64 B)
{
if (B & TwoFullRanks)
return first_piece[B & TwoFullRanks];
if (B & FPMask1)
return first_piece[(B >> 16) & TwoFullRanks] + 16;
if (B & FPMask2)
return first_piece[(B >> 32) & TwoFullRanks] + 32;
return first_piece[B >> 48] + 48;
}
unsigned firstb(unsigned __int64 bits)
{
__asm {
;
; This very clever first bit routine (much too clever for me)
; is due to Terje Mathisen
;
mov ecx, DWORD PTR [bits+4]
mov edx, 0
mov ebx, DWORD PTR [bits]
mov eax, 0
; The result should be [0..63] if the input was non-zero, -1 otherwise
; Input in ECX:EBX
bsf edx,ecx ; Bit index for second half
bsf eax,ebx ; Bit index of first half
add edx,32 ; Add 32 to the second half index
sub ebx,1
sbb ebx,ebx ; -1 if EBX (first half) was zero
sub ecx,1
sbb ecx,ecx ; -1 if ECX (second half) was zero
and edx,ebx ; EDX will be zero unless low half was zero
add eax,edx ; return (EAX != 0): EAX : EDX+32
and ebx,ecx ; -1 if all 64 bits were zero
or eax,ebx
}
}
unsigned fb(unsigned __int64 x)
{
return (unsigned)(x & (-x));
}
unsigned firstbit(unsigned __int64 bits)
{
__asm {
; 64-bit number to move into ECX:EBX, will never be zero!
mov ecx,dword ptr [bits+4]
mov ebx,dword ptr [bits]
bsf eax,ecx
add eax,32
bsf eax,ebx
}
}
unsigned first(unsigned __int64 bits)
{
__asm {
mov ecx, DWORD PTR [bits+4]
mov edx, 0
mov ebx, DWORD PTR [bits]
mov eax, 0
; The result should be [0..63] if the input was non-zero, -1 otherwise
bsf edx,ecx ; Bit index for second half
mov eax, -33 ; Need edx to be -1 if ecx was zero
cmovz edx, eax ; Now we are ready for an empty qword
bsf eax,ebx ; Bit index of first half
lea edx,+32[edx] ; Add 32 to the second half index
cmovz eax, edx ; Predicated move and we are done
}
}
/*
* Find number of nonzero bits in a unsigned __int64.
* Defined by Ernst A. Heinz, University of Karlsruhe.
*/
unsigned int CPopulationCount(unsigned __int64 b)
{
unsigned int n;
for (n = 0; b != 0; n++, b &= (b-1));
return n;
}
/*
This one sometimes gives the WRONG ANSWER!!!
#define mm1 ((unsigned __int64) 0x5555555555555555)
#define mm2 ((unsigned __int64) 0x3333333333333333)
unsigned int NCPopulationCount(unsigned __int64 b)
{
unsigned int n;
const unsigned __int64 a = b - ((b >> 1) & mm1);
const unsigned __int64 c = (a & mm2) + ((a >> 2) & mm2);
n = ((unsigned int)c) + ((unsigned int)(c >> 32));
n = (n & 0x0F0F0F0F) + ((n >> 4) & 0x0F0F0F);
n = (n & 0xFFFF) + (n >> 16);
n = (n & 0xFF) + (n >> 8);
return n;
}
*/
/*
; population count of a 32 bit register
;
; input:
; esi = number to population count
;
; output:
; eax = population count
;
; destroys:
; esi, eax
; eflags
*/
unsigned popcount32fm(unsigned int input)
{
__asm {
mov esi, input
mov eax,esi
shr eax,1
and eax,055555555h ; (n >> 1) & 0x55555555
sub esi,eax ; n - ((n >> 1) & 0x55555555)
mov eax,esi
shr eax,2 ; n >> 2
and esi,033333333h ; n & 0x33333333
and eax,033333333h ; (n >> 2) & 0x33333333
add esi,eax ; n = (n & 0x33333333) + ((n >> 2) &
0x33333333)
mov eax,esi
shr eax,4 ; n >> 4
add eax,esi ; n + (n >> 4)
and eax,00F0F0F0Fh ; n = (n + (n >> 4) & 0x0F0F0F0F)
imul eax,001010101h ; add by multiplying with a "magic
number"
shr eax,24 ; shift result into place
}
}
unsigned popcount32(unsigned int input)
{
__asm {
mov esi, input
mov eax,esi
shr eax,1
and eax,055555555h ; (n >> 1) & 0x55555555
sub esi,eax ; n - ((n >> 1) & 0x55555555)
mov eax,esi
shr eax,2 ; n >> 2
and esi,033333333h ; n & 0x33333333
and eax,033333333h ; (n >> 2) & 0x33333333
add esi,eax ; n = (n & 0x33333333) + ((n >> 2) &
0x33333333)
mov eax,esi
shr eax,4 ; n >> 4
add eax,esi ; n + (n >> 4)
and eax,00F0F0F0Fh ; n = (n + (n >> 4) & 0x0F0F0F0F)
mov esi,eax
shr esi,8 ; n >> 8
add eax,esi ; n = n + (n >> 8)
mov esi,eax
shr esi,16 ; n >> 16
add eax,esi ; n = n + (n >> 16)
and eax,00000003Fh ; popcount
}
}
typedef union {
unsigned __int64 i64;
unsigned i32[2];
} bb;
unsigned pc(bb b)
{
return (popcount32fm(b.i32[0]) + popcount32fm(b.i32[1]));
}
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <limits.h>
int rand_range(int N)
{
return (int) ((double) rand() / ((double) RAND_MAX + 1) * N);
}
char string[32767];
#define BITMASK(b) (1 << ((b) % CHAR_BIT))
#define BITSLOT(b) ((b) / CHAR_BIT)
#define BITSET(a, b) ((a)[BITSLOT(b)] |= BITMASK(b))
int main(void)
{
int i, j;
int mbc;
int ntr;
int c1,
c2,
c3,
c4,
c5,
c6,
c7;
int a1=1,
a2=1,
a3=1,
a4=1,
a5=1,
a6=1,
a7=1;
bb a;
unsigned char *bar = (unsigned char *) &a.i64;
ifp();
puts("Please enter the maximum number of bits:");
if (fgets(string, sizeof string, stdin))
mbc = atoi(string);
mbc = max(mbc, 1);
mbc = min(mbc, 32);
puts("Please enter the maximum number of trials:");
if (fgets(string, sizeof string, stdin))
ntr = atoi(string);
for (i = 0; i < ntr; i++) {
a.i64=0;
for (j = 0; j < mbc; j++)
BITSET(bar, rand_range(63));
#ifdef TEST_BIT_COUNT
c1 = DensePopulationCount(a.i64);
c2 = SparsePopulationCount(a.i64);
c3 = CPopulationCount(a.i64);
// c4 = NCPopulationCount(a.i64);
c5 = popcount32fm(a.i32[0]) + popcount32fm(a.i32[1]);
c6 = popcount32(a.i32[0]) + popcount32(a.i32[1]);
// c7 = pc(a);
if (c1 != c2)
printf("c1=%u, c2 = %u\n", c1, c2);
if (c1 != c3)
printf("c1=%u, c3 = %u\n", c1, c3);
// if (c1 != c4)
// printf("c1=%u, c4 = %u\n", c1, c4);
if (c1 != c5)
printf("c1=%u, c5 = %u\n", c1, c5);
if (c1 != c6)
printf("c1=%u, c6 = %u\n", c1, c6);
// if (c1 != c7)
// printf("c1=%u, c7 = %u\n", c1, c7);
#else
c3 = FirstPiece(a.i64);
c4 = firstb(a.i64);
c6 = firstbit(a.i64);
c7 = first(a.i64);
a3 &= c3 == c3;
a4 &= c4 == c3;
a6 &= c6 == c3;
a7 &= c7 == c3;
#ifdef _DEBUG
printf("n=%x %x, c3 = %d, c4 = %d, c6 = %d, c7 = %d\n",
a.i32[0], a.i32[1], c3, c4, c6, c7);
#endif
#endif
}
if (a3) puts("c3 agrees with c3");
if (a4) puts("c4 agrees with c3");
if (a6) puts("c6 agrees with c3");
if (a7) puts("c7 agrees with c3");
return 0;
}
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.