Author: Dieter Buerssner
Date: 09:34:22 04/19/03
Go up one level in this thread
On April 19, 2003 at 05:29:13, Gerd Isenberg wrote: >Yes Dieter, i tried a lot. Gerd, I have no doubt about this. I only questioned the 2 specific routines (the loops on the 2 32 bit parts, once in assembler, once in C). I questioned it only, because I looked at the generated assembler of the C-Code, which was practically identical, to the assembly you showed. Anyway, I made a small test program. dieter() is the C-Code I had shown, gerd() your assembly (I wrote the same in GCC-style, too), and crafty() the routine Uri showed in this thread. In my test, gerd() was not faster than dieter() (dieter() was actually slightly faster than gerd() using MSVC, and the same using gcc). As should be expected, with this specific test, crafty() was a bit slower (much more with gcc than with MSVC). I used -O2 for both compilers. No doubt, one can argue about my testing procedure. I actually made it out of fun only, not to do some scientific experiments. I also used a union for bitboard, and changed for this the functions in the thread slightly. I just did not want to try which tricks would work best to convince the compiler to extract the 2 32-bit parts efficiently. Regards, Dieter #ifdef __GNUC__ #define MYINLINE static __inline__ typedef unsigned long long BITBOARD; #else #define MYINLINE static __forceinline typedef unsigned __int64 BITBOARD; #endif /* Note, this is not strictly portable, even not the pure C code*/ typedef union { BITBOARD b; unsigned long w[2]; } BU; MYINLINE int crafty(register BU a) { register int c=0; while(a.b) { c++; a.b &= a.b - 1; } return(c); } MYINLINE int dieter(BU a) { unsigned long w; int n = 0; w = a.w[0]; if (w) do { n++; } while ((w &= w-1) != 0); w = a.w[1]; if (w) do { n++; } while ((w &= w-1) != 0); return n; } #ifdef __GNUC__ MYINLINE int gerd(BU a) { int tmp, tmp2, n; __asm__ __volatile__( " movl %3, %1 xorl %0, %0 testl %1, %1 je 1f 0: incl %0 leal -1(%1), %2 andl %2, %1 jne 0b 1: movl %4, %1 testl %1, %1 je 3f 2: incl %0 leal -1(%1), %2 andl %2, %1 jne 2b 3:" : "=r&" (n), "=r&" (tmp), "=r&" (tmp2) : "g" (a.w[0]), "g" (a.w[1]) : "cc" /* Flags "condition code" changed */); return n; } #else MYINLINE int gerd (BU bb) { __asm { mov ecx, dword ptr bb xor eax, eax test ecx, ecx jz l1 l0: lea edx, [ecx-1] inc eax and ecx, edx jnz l0 l1: mov ecx, dword ptr bb+4 test ecx, ecx jz l3 l2: lea edx, [ecx-1] inc eax and ecx, edx jnz l2 l3: } } #endif #include <stdlib.h> #include <stdio.h> #include <time.h> #define NLOOPS 10000000L #define NBB 100 /* Number of different bitboards */ #define NBITS 4 /* That many bits set randomly, can be less */ #define PRNG rand #define MY_RAND_MAX RAND_MAX unsigned long rand_range(unsigned long range) { unsigned long rmax, r, d; /* find the largest number rmax <= MY_RAND_MAX, for which (rmax+1) % range == 0. All returns from rand() > rmax will be skipped, to guarantee equal probability for all return values. */ d = (MY_RAND_MAX+1U-range) / range + 1; rmax = d * range - 1; /* -1 to avoid "overflow to zero" */ do r = PRNG(); while (r > rmax); return r/d; } BU bbs[NBB]; /* Define functions in "wrong" order, to not let the compiler do stupid inlining, that can make the code slower */ unsigned long foo(void); unsigned long bar(void); int main(void) { int i, j; clock_t c; /* Set some random bitboards */ for (i=0; i<NBB; i++) for (j=0; j<NBITS; j++) bbs[i].b |= (BITBOARD)1 << rand_range(64); c = clock(); /* Print number of counted bits, so we get an indicication, when something goes wrong. My easily overflow. */ printf("res %lu\n", bar()); c = clock()-c; printf("used %.3f s\n", (double)c/CLOCKS_PER_SEC); return 0; } unsigned long bar(void) { long i; unsigned long n=0; for (i=0; i<NLOOPS; i++) n += foo(); return n; } unsigned long foo(void) { int i; unsigned long n=0; for (i=0; i<NBB; i++) n += crafty(bbs[i]); return n; }
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.