Author: Dieter Buerssner
Date: 09:34:22 04/19/03
Go up one level in this thread
On April 19, 2003 at 05:29:13, Gerd Isenberg wrote:
>Yes Dieter, i tried a lot.
Gerd, I have no doubt about this. I only questioned the 2 specific routines (the
loops on the 2 32 bit parts, once in assembler, once in C). I questioned it
only, because I looked at the generated assembler of the C-Code, which was
practically identical, to the assembly you showed.
Anyway, I made a small test program. dieter() is the C-Code I had shown, gerd()
your assembly (I wrote the same in GCC-style, too), and crafty() the routine Uri
showed in this thread. In my test, gerd() was not faster than dieter() (dieter()
was actually slightly faster than gerd() using MSVC, and the same using gcc). As
should be expected, with this specific test, crafty() was a bit slower (much
more with gcc than with MSVC). I used -O2 for both compilers. No doubt, one can
argue about my testing procedure. I actually made it out of fun only, not to do
some scientific experiments. I also used a union for bitboard, and changed for
this the functions in the thread slightly. I just did not want to try which
tricks would work best to convince the compiler to extract the 2 32-bit parts
efficiently.
Regards,
Dieter
#ifdef __GNUC__
#define MYINLINE static __inline__
typedef unsigned long long BITBOARD;
#else
#define MYINLINE static __forceinline
typedef unsigned __int64 BITBOARD;
#endif
/* Note, this is not strictly portable, even not the pure C code*/
typedef union
{
BITBOARD b;
unsigned long w[2];
} BU;
MYINLINE int crafty(register BU a) {
register int c=0;
while(a.b) {
c++;
a.b &= a.b - 1;
}
return(c);
}
MYINLINE int dieter(BU a)
{
unsigned long w;
int n = 0;
w = a.w[0];
if (w)
do
{
n++;
}
while ((w &= w-1) != 0);
w = a.w[1];
if (w)
do
{
n++;
}
while ((w &= w-1) != 0);
return n;
}
#ifdef __GNUC__
MYINLINE int gerd(BU a)
{
int tmp, tmp2, n;
__asm__ __volatile__(
" movl %3, %1
xorl %0, %0
testl %1, %1
je 1f
0: incl %0
leal -1(%1), %2
andl %2, %1
jne 0b
1: movl %4, %1
testl %1, %1
je 3f
2: incl %0
leal -1(%1), %2
andl %2, %1
jne 2b
3:"
: "=r&" (n), "=r&" (tmp), "=r&" (tmp2)
: "g" (a.w[0]), "g" (a.w[1])
: "cc" /* Flags "condition code" changed */);
return n;
}
#else
MYINLINE int gerd (BU bb)
{
__asm
{
mov ecx, dword ptr bb
xor eax, eax
test ecx, ecx
jz l1
l0: lea edx, [ecx-1]
inc eax
and ecx, edx
jnz l0
l1: mov ecx, dword ptr bb+4
test ecx, ecx
jz l3
l2: lea edx, [ecx-1]
inc eax
and ecx, edx
jnz l2
l3:
}
}
#endif
#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#define NLOOPS 10000000L
#define NBB 100 /* Number of different bitboards */
#define NBITS 4 /* That many bits set randomly, can be less */
#define PRNG rand
#define MY_RAND_MAX RAND_MAX
unsigned long rand_range(unsigned long range)
{
unsigned long rmax, r, d;
/* find the largest number rmax <= MY_RAND_MAX, for which
(rmax+1) % range == 0.
All returns from rand() > rmax will be skipped, to guarantee
equal probability for all return values. */
d = (MY_RAND_MAX+1U-range) / range + 1;
rmax = d * range - 1; /* -1 to avoid "overflow to zero" */
do
r = PRNG();
while (r > rmax);
return r/d;
}
BU bbs[NBB];
/* Define functions in "wrong" order, to not let the compiler
do stupid inlining, that can make the code slower */
unsigned long foo(void);
unsigned long bar(void);
int main(void)
{
int i, j;
clock_t c;
/* Set some random bitboards */
for (i=0; i<NBB; i++)
for (j=0; j<NBITS; j++)
bbs[i].b |= (BITBOARD)1 << rand_range(64);
c = clock();
/* Print number of counted bits, so we get an indicication,
when something goes wrong. My easily overflow. */
printf("res %lu\n", bar());
c = clock()-c;
printf("used %.3f s\n", (double)c/CLOCKS_PER_SEC);
return 0;
}
unsigned long bar(void)
{
long i;
unsigned long n=0;
for (i=0; i<NLOOPS; i++)
n += foo();
return n;
}
unsigned long foo(void)
{
int i;
unsigned long n=0;
for (i=0; i<NBB; i++)
n += crafty(bbs[i]);
return n;
}
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.