Computer Chess Club Archives


Search

Terms

Messages

Subject: Re: Population of disjoint Attacksets

Author: Gerd Isenberg

Date: 07:07:42 05/31/04

Go up one level in this thread


On May 30, 2004 at 16:42:53, Dieter Buerssner wrote:

>Perhaps your results are not strange at all. Your function just seems to be fast
>when everything is inlined :-) On my computer, my method is a bit faster.
>
>I tried a few things. Some observations. With MSVC 6, changing your odd() and
>maj() functions to macros gives reproducably faster times here. 4.24 vs 4.49 ns
>per 32-bit popcount (This would translate to 84.8 vs 89.8 s in your 1e9 loop). I
>also used clock() for timing, but did it slightly different than you with some
>#define tricks, so that each function will be called inside exactly the same
>environment (each test run for only one function).
>
>With Gcc, using inline instead of the macro, was faster ...
>
>My pc_10bb needs 3.94 ns. It was coded not very well. One can also code it
>without the need of temporary arrays (code even gets smaller). Both Gcc and MSVC
>can translate it, so that it will need 6 registers (so no temporary space on the
>stack at all). An unrolled version will need 5 registers (about 280
>instructions). I show the loop version at the end.
>
>This somewhat more carfully loop version needs 3.35ns. Unrolled 3.08 ns. This is
>only 7.8 cycles per 32-bit popcount. The fastest I got your function, was 4.24
>ns/32-bit word (using MSVC, macro odd/maj and a version of popcount similar to
>yours, but without multiplication, and adding the counters one stage earlier -
>all called functions inlined).

Amazing Dieter!

I tried a MMX-version based on the "dead slow" popcount of amd's optimization
manual, with the eight add,major pairs. Even that takes about 42ns -> 2.1
ns/32-bit. To get an idea what is possible with AMD64 gp registers!

Cheers,
Gerd


#define CACHE_LINE  32
#define CACHE_ALIGN __declspec(align(CACHE_LINE))

__forceinline
int popCount10(BitBoard a[])
{
	struct SBitCountConsts
	{
		BitBoard C33;
		BitBoard C55;
		BitBoard C0F;
	};

	const SBitCountConsts CACHE_ALIGN BitCountConsts =
	{
		0x3333333333333333,	// C33
		0x5555555555555555,	// C55
		0x0f0f0f0f0f0f0f0f,	// C0F
	};

	__asm
	{
		mov	eax, [a]
//	mm0 t1   = odd( a[0], a[1], a[2]);
//	mm1 t2   = maj( a[0], a[1], a[2]);
		movq	mm0, [eax]
		movq	mm7, [eax]
		pxor	mm0, [eax+8]	; x^y
		pand	mm7, [eax+8]    ; x&y
		movq	mm1, mm0	; x^y
		pxor	mm0, [eax+16]	; t1
		pand	mm1, [eax+16]   ; (x^y)&z
		por	mm1, mm7	; t2
//	mm2 u1   = odd( a[3], a[4], a[5]);
//	mm3 u2   = maj( a[3], a[4], a[5]);
		movq	mm2, [eax+24]
		movq	mm7, [eax+24]
		pxor	mm2, [eax+32]	; x^y
		pand	mm7, [eax+32]   ; x&y
		movq	mm3, mm2	; x^y
		pand	mm3, [eax+40]   ; (x^y)&z
		pxor	mm2, [eax+40]
		por	mm3, mm7
//	mm4 v1   = odd( a[6], a[7], a[8]);
//	mm5 v2   = maj( a[6], a[7], a[8]);
		movq	mm4, [eax+48]
		movq	mm7, [eax+48]
		pxor	mm4, [eax+56]	; x^y
		pand	mm7, [eax+56]   ; x&y
		movq	mm5, mm4	; x^y
		pand	mm5, [eax+64]   ; (x^y)&z
		pxor	mm4, [eax+64]	; v1
		por	mm5, mm7	; v2
//	w1   = odd( a[9], t1, u1);
//	w2   = maj( a[9], t1, u1);
		movq	mm6, [eax+72]
		movq	mm7, [eax+72]
		pxor	mm6, mm0	; x^y
		pand	mm7, mm0        ; x&y
		movq	mm0, mm6	; x^y
		pand	mm0, mm2        ; (x^y)&z
		pxor	mm6, mm2        ; w1
		por	mm7, mm0        ; w2
//	c[0] = odd(v1, w1, 0);
//	x2   = maj(v1, w1, 0);
		movq	mm0, mm6
		pand	mm6, mm4	; x2
		pxor	mm0, mm4	; mm0 -> c0
//	y2   = odd(t2, u2, v2);
//	y4   = maj(t2, u2, v2);
		movq	mm2, mm1
		pxor	mm1, mm3	; x^y
		pand	mm2, mm3        ; x&y
		movq	mm3, mm1	; x^y
		pand	mm3, mm5        ; (x^y)&z
		pxor	mm1, mm5        ; y2
		por	mm3, mm2        ; y4
//	c[1] = odd(w2, x2, y2);
//	z4   = maj(w2, x2, y2);
		movq	mm2, mm1
		pxor	mm2, mm7	; x^y
		pand	mm1, mm7        ; x&y
		movq	mm4, mm2	; x^y
		pand	mm4, mm6        ; (x^y)&z
		pxor	mm2, mm6        ; mm2 -> c1
		por	mm4, mm1        ; z4
//	c[2] = odd(y4, z4, 0);
//	c[3] = maj(y4, z4, 0);
		lea	eax, [BitCountConsts]
		movq	mm6, mm4
		pxor	mm4, mm3	; mm4 -> c2
		pand	mm6, mm3	; mm6 -> c3

		movq	mm1,mm0
		movq	mm3,mm2
		movq	mm5,mm4
		movq	mm7,mm6

		psrld	mm0,1
		psrld	mm2,1
		psrld	mm4,1
		psrld	mm6,1

		pand	mm0,[eax].C55
		pand	mm2,[eax].C55
		pand	mm4,[eax].C55
		pand	mm6,[eax].C55

		psubb	mm1,mm0
		psubb	mm3,mm2
		psubb	mm5,mm4
		psubb	mm7,mm6

		movq	mm0,mm1
		movq	mm2,mm3
		movq	mm4,mm5
		movq	mm6,mm7

		psrld	mm1,2
		psrld	mm3,2
		psrld	mm5,2
		psrld	mm7,2

		pand	mm0,[eax].C33
		pand	mm1,[eax].C33
		pand	mm2,[eax].C33
		pand	mm3,[eax].C33
		pand	mm4,[eax].C33
		pand	mm5,[eax].C33
		pand	mm6,[eax].C33
		pand	mm7,[eax].C33

		paddb	mm0,mm1
		paddb	mm2,mm3
		paddb	mm4,mm5
		paddb	mm6,mm7

		movq	mm1,mm0
		movq	mm3,mm2
		movq	mm5,mm4
		movq	mm7,mm6

		psrld	mm0,4
		psrld	mm2,4
		psrld	mm4,4
		psrld	mm6,4

		paddb	mm0,mm1
		paddb	mm2,mm3
		paddb	mm4,mm5
		paddb	mm6,mm7

		pand	mm0,[eax].C0F
		pand	mm2,[eax].C0F
		pand	mm4,[eax].C0F
		pand	mm6,[eax].C0F

		pxor	mm1,mm1

		paddb	mm6,mm6
		paddb	mm6,mm4
		paddb	mm6,mm6
		paddb	mm6,mm2
		paddb	mm6,mm6
		paddb	mm6,mm0

		psadbw	mm6,mm1	; add all bytes
		movd	eax,mm6
	}
}





This page took 0 seconds to execute

Last modified: Thu, 15 Apr 21 08:11:13 -0700

Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.