Author: Dann Corbit
Date: 02:03:59 06/15/01
Go up one level in this thread
This is the test driver I used to bench bit operations: #include "amd3dx.h" /* Athlon MMX optimized popcount function, based on AMD's Athlon optimization manuals */ unsigned DensePopulationCount (unsigned __int64 v) { static const __int64 C55 = 0x5555555555555555; static const __int64 C33 = 0x3333333333333333; static const __int64 C0F = 0x0F0F0F0F0F0F0F0F; __asm { movd mm0, word ptr v; punpckldq mm0, word ptr v + 4; movq mm1,mm0; psrld mm0,1; pand mm0,[C55]; psubd mm1,mm0; movq mm0,mm1; psrld mm1,2; pand mm0,[C33]; pand mm1,[C33]; paddd mm0,mm1; movq mm1,mm0; psrld mm0,4; paddd mm0,mm1; pand mm0,[C0F]; pxor mm1,mm1; psadbw (mm0,mm1); movd eax,mm0; emms; } } /* From Crafty, this is for sparsely populated bitmaps */ unsigned SparsePopulationCount (unsigned __int64 a) { __asm { mov ecx, dword ptr a xor eax, eax test ecx, ecx jz l1 l0: lea edx, [ecx-1] inc eax and ecx, edx jnz l0 l1: mov ecx, dword ptr a+4 test ecx, ecx jz l3 l2: lea edx, [ecx-1] inc eax and ecx, edx jnz l2 l3: } } unsigned FirstBit(unsigned __int64 a) { __asm { bsr edx, dword ptr a+4 mov eax, 31 jnz l1 bsr edx, dword ptr a mov eax, 63 jnz l1 mov edx, -1 l1: sub eax, edx } } unsigned LastBit(unsigned __int64 a) { __asm { bsf edx, dword ptr a mov eax, 63 jnz l1 bsf edx, dword ptr a+4 mov eax, 31 jnz l1 mov edx, -33 l1: sub eax, edx } } unsigned int first_piece[65536]; #define TwoFullRanks (65535) static unsigned __int64 FPMask1; static unsigned __int64 FPMask2; int ifp() { unsigned i, j; FPMask1 = ((unsigned __int64 ) TwoFullRanks << 16); FPMask2 = ((unsigned __int64 ) TwoFullRanks << 32); /* Lookup tables for the first- & last- piece functions */ for (i = 0; i < 65536; i++) { /* First piece in a two rank chunk */ for (j = 0; j < 16; j++) { if (i & (1 << j)) { first_piece[i] = j; break; } } } } unsigned FirstPiece(unsigned __int64 B) { if (B & TwoFullRanks) return first_piece[B & TwoFullRanks]; if (B & FPMask1) return first_piece[(B >> 16) & TwoFullRanks] + 16; if (B & FPMask2) return first_piece[(B >> 32) & TwoFullRanks] + 32; return first_piece[B >> 48] + 48; } unsigned firstb(unsigned __int64 bits) { __asm { ; ; This very clever first bit routine (much too clever for me) ; is due to Terje Mathisen ; mov ecx, DWORD PTR [bits+4] mov edx, 0 mov ebx, DWORD PTR [bits] mov eax, 0 ; The result should be [0..63] if the input was non-zero, -1 otherwise ; Input in ECX:EBX bsf edx,ecx ; Bit index for second half bsf eax,ebx ; Bit index of first half add edx,32 ; Add 32 to the second half index sub ebx,1 sbb ebx,ebx ; -1 if EBX (first half) was zero sub ecx,1 sbb ecx,ecx ; -1 if ECX (second half) was zero and edx,ebx ; EDX will be zero unless low half was zero add eax,edx ; return (EAX != 0): EAX : EDX+32 and ebx,ecx ; -1 if all 64 bits were zero or eax,ebx } } unsigned fb(unsigned __int64 x) { return (unsigned)(x & (-x)); } unsigned firstbit(unsigned __int64 bits) { __asm { ; 64-bit number to move into ECX:EBX, will never be zero! mov ecx,dword ptr [bits+4] mov ebx,dword ptr [bits] bsf eax,ecx add eax,32 bsf eax,ebx } } unsigned first(unsigned __int64 bits) { __asm { mov ecx, DWORD PTR [bits+4] mov edx, 0 mov ebx, DWORD PTR [bits] mov eax, 0 ; The result should be [0..63] if the input was non-zero, -1 otherwise bsf edx,ecx ; Bit index for second half mov eax, -33 ; Need edx to be -1 if ecx was zero cmovz edx, eax ; Now we are ready for an empty qword bsf eax,ebx ; Bit index of first half lea edx,+32[edx] ; Add 32 to the second half index cmovz eax, edx ; Predicated move and we are done } } /* * Find number of nonzero bits in a unsigned __int64. * Defined by Ernst A. Heinz, University of Karlsruhe. */ unsigned int CPopulationCount(unsigned __int64 b) { unsigned int n; for (n = 0; b != 0; n++, b &= (b-1)); return n; } /* This one sometimes gives the WRONG ANSWER!!! #define mm1 ((unsigned __int64) 0x5555555555555555) #define mm2 ((unsigned __int64) 0x3333333333333333) unsigned int NCPopulationCount(unsigned __int64 b) { unsigned int n; const unsigned __int64 a = b - ((b >> 1) & mm1); const unsigned __int64 c = (a & mm2) + ((a >> 2) & mm2); n = ((unsigned int)c) + ((unsigned int)(c >> 32)); n = (n & 0x0F0F0F0F) + ((n >> 4) & 0x0F0F0F); n = (n & 0xFFFF) + (n >> 16); n = (n & 0xFF) + (n >> 8); return n; } */ /* ; population count of a 32 bit register ; ; input: ; esi = number to population count ; ; output: ; eax = population count ; ; destroys: ; esi, eax ; eflags */ unsigned popcount32fm(unsigned int input) { __asm { mov esi, input mov eax,esi shr eax,1 and eax,055555555h ; (n >> 1) & 0x55555555 sub esi,eax ; n - ((n >> 1) & 0x55555555) mov eax,esi shr eax,2 ; n >> 2 and esi,033333333h ; n & 0x33333333 and eax,033333333h ; (n >> 2) & 0x33333333 add esi,eax ; n = (n & 0x33333333) + ((n >> 2) & 0x33333333) mov eax,esi shr eax,4 ; n >> 4 add eax,esi ; n + (n >> 4) and eax,00F0F0F0Fh ; n = (n + (n >> 4) & 0x0F0F0F0F) imul eax,001010101h ; add by multiplying with a "magic number" shr eax,24 ; shift result into place } } unsigned popcount32(unsigned int input) { __asm { mov esi, input mov eax,esi shr eax,1 and eax,055555555h ; (n >> 1) & 0x55555555 sub esi,eax ; n - ((n >> 1) & 0x55555555) mov eax,esi shr eax,2 ; n >> 2 and esi,033333333h ; n & 0x33333333 and eax,033333333h ; (n >> 2) & 0x33333333 add esi,eax ; n = (n & 0x33333333) + ((n >> 2) & 0x33333333) mov eax,esi shr eax,4 ; n >> 4 add eax,esi ; n + (n >> 4) and eax,00F0F0F0Fh ; n = (n + (n >> 4) & 0x0F0F0F0F) mov esi,eax shr esi,8 ; n >> 8 add eax,esi ; n = n + (n >> 8) mov esi,eax shr esi,16 ; n >> 16 add eax,esi ; n = n + (n >> 16) and eax,00000003Fh ; popcount } } typedef union { unsigned __int64 i64; unsigned i32[2]; } bb; unsigned pc(bb b) { return (popcount32fm(b.i32[0]) + popcount32fm(b.i32[1])); } #include <stdlib.h> #include <stdio.h> #include <string.h> #include <limits.h> int rand_range(int N) { return (int) ((double) rand() / ((double) RAND_MAX + 1) * N); } char string[32767]; #define BITMASK(b) (1 << ((b) % CHAR_BIT)) #define BITSLOT(b) ((b) / CHAR_BIT) #define BITSET(a, b) ((a)[BITSLOT(b)] |= BITMASK(b)) int main(void) { int i, j; int mbc; int ntr; int c1, c2, c3, c4, c5, c6, c7; int a1=1, a2=1, a3=1, a4=1, a5=1, a6=1, a7=1; bb a; unsigned char *bar = (unsigned char *) &a.i64; ifp(); puts("Please enter the maximum number of bits:"); if (fgets(string, sizeof string, stdin)) mbc = atoi(string); mbc = max(mbc, 1); mbc = min(mbc, 32); puts("Please enter the maximum number of trials:"); if (fgets(string, sizeof string, stdin)) ntr = atoi(string); for (i = 0; i < ntr; i++) { a.i64=0; for (j = 0; j < mbc; j++) BITSET(bar, rand_range(63)); #ifdef TEST_BIT_COUNT c1 = DensePopulationCount(a.i64); c2 = SparsePopulationCount(a.i64); c3 = CPopulationCount(a.i64); // c4 = NCPopulationCount(a.i64); c5 = popcount32fm(a.i32[0]) + popcount32fm(a.i32[1]); c6 = popcount32(a.i32[0]) + popcount32(a.i32[1]); // c7 = pc(a); if (c1 != c2) printf("c1=%u, c2 = %u\n", c1, c2); if (c1 != c3) printf("c1=%u, c3 = %u\n", c1, c3); // if (c1 != c4) // printf("c1=%u, c4 = %u\n", c1, c4); if (c1 != c5) printf("c1=%u, c5 = %u\n", c1, c5); if (c1 != c6) printf("c1=%u, c6 = %u\n", c1, c6); // if (c1 != c7) // printf("c1=%u, c7 = %u\n", c1, c7); #else c3 = FirstPiece(a.i64); c4 = firstb(a.i64); c6 = firstbit(a.i64); c7 = first(a.i64); a3 &= c3 == c3; a4 &= c4 == c3; a6 &= c6 == c3; a7 &= c7 == c3; #ifdef _DEBUG printf("n=%x %x, c3 = %d, c4 = %d, c6 = %d, c7 = %d\n", a.i32[0], a.i32[1], c3, c4, c6, c7); #endif #endif } if (a3) puts("c3 agrees with c3"); if (a4) puts("c4 agrees with c3"); if (a6) puts("c6 agrees with c3"); if (a7) puts("c7 agrees with c3"); return 0; }
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.