Computer Chess Club Archives


Search

Terms

Messages

Subject: Re: RankAttacks with x^(x-2)

Author: Gerd Isenberg

Date: 08:37:28 10/01/02

Go up one level in this thread


On October 01, 2002 at 08:07:58, Vincent Diepeveen wrote:

Hi Vincent,

>On October 01, 2002 at 03:58:50, Gerd Isenberg wrote:
>
>If i may remind you the hyperbola guy said that his idea
>was just a 'thought'. Nothing concrete implemented.

Yes.

>
>Anyway, which processor did you test it at for the speed
>and were both tests within the mmx and how would it do outside
>mmx and inside the normal processor?
>

Athlon XP2100+ 512MB SDRAM
All tests with mmx-registers with a dumb loop:

	for (int i = 0; i < 1000000000; ++i) // 10**9 times
	{
		__asm	movq	mm6, [emptySquares]
		__asm	movq	mm1, [queens]
		getQueenAttacksMMX();
		__asm	movq	[queenAttacks], mm0
	}

none inlined
getRookAttacksMMX()    24 seconds
getQueenAttacksMMX()   51 seconds

The classical rotated Lookup (inlined also with mmx)
getQueenAttacks (i&63) 35 seconds
getQueenAttacks (i&1)  55!seconds

General purpose registers became interessant with hammer (eight additional) for
this algorithms. But also hammer is compatible with mmx and xmm (SSE/SSE2)
register sets. So i let the compiler his general purpose registers to use myself
the mmx- or xmm-registers for this low level stuff.

With x86-32 architecture you can hold only up to three bitboards in 32bit
registers.

One drawback with current athlon is that returning a bitboard in edx:eax is
rather expensive. It requires two "movd reg32,mmxreg" vector-path instructions.
That should become clearly better with hammer using only one "movq rax,mm0".
Another drawback with x86-32 is the probably missing 8-Byte alignment of pushed
actual bitboard parameters on the stack. Therefore it's recommend to use
	movd	   mm0, [bp]
	punpckldq  mm0, [bp+4]
instead of one
        movq       mm0, [bp]


>I mean P4 and probably future P4s have just 1 mmx register.
>not a nice thought :)

Hmm, P4 has 8 mmx and 8 xmm (128bit) registers (SSE/SEE2). I can't believe that
they will reduce their registers and make a lot of (multimedia) software
incompatible.

See you in Leiden,
Gerd


<snip>


// input:	mm1 queens
//		mm6 emptySquares
// output:      mm0 queenAttacks
//  all mmx registers changed
//==============================
void getQueenAttacksMMX()
{
	__asm
	{	// up/down Kogge-Stone
		movq	mm2, mm1	; generator upg
		movq	mm4, mm1	; generator dng
		movq	mm5, mm6	; propagator upp
		movq	mm7, mm6	; propagator dnp
		psllq	mm2, 8		; upg << 8
		psrlq	mm4, 8		; dng >> 8
		psllq	mm5, 8		; upp << 8
		psrlq	mm7, 8		; dnp >> 8
		pand	mm2, mm6	; upp & (upg << 8)
		pand	mm4, mm6	; dnp & (dng >> 8)
		por	mm2, mm1	; upg |= upp & (upg << 8)
		por	mm4, mm1	; dng |= dnp & (dng >> 8)
		pand	mm5, mm6	; upp &= upp << 8
		pand	mm7, mm6	; dnp &= dnp >> 8
		movq	mm0, mm2	; upg
		movq	mm3, mm4	; dng
		psllq	mm2, 16		; upg << 16
		psrlq	mm4, 16		; dng >> 16
		pand	mm2, mm5	; upp & (upg << 16)
		pand	mm4, mm7	; dnp & (dng >> 16)
		por	mm2, mm0	; upg |= upp & (upg << 16)
		por	mm4, mm3	; dng |= dnp & (dng >> 16)
		movq	mm0, mm5	; upp
		movq	mm3, mm7	; dnp
		psllq	mm5, 16		; upp << 16
		psrlq	mm7, 16		; dnp >> 16
		pand	mm5, mm0	; upp &= p << 16
		movq	mm0, mm2	; upg
		pand	mm7, mm3	; dnp &= p >> 16
		movq	mm3, mm4	; dng
		psrlq	mm4, 32		; dng >> 32
		psllq	mm0, 32		; upg << 32
		pand	mm0, mm5	; upp & (upg << 32)
		// right with x^(x-2)
		pcmpeqd mm5, mm5	; 0xffffffffffffffff -1
		pand	mm4, mm7	; dnp & (dng >> 32)
		pcmpeqd mm7, mm7	; 0xffffffffffffffff -1
		por	mm0, mm2	; upg |= upp & (upg << 32)
		pxor	mm5, mm6	; not empty ==> occupied
		por	mm4, mm3	; dng |= dnp & (dng >> 32)
		por	mm5, mm1	; force queens as subset of occupied
		psllq	mm0, 8		; final shift up ==> queenAttacks
		psrlq	mm4, 8		; final shift dn
		movq	mm3, mm5	; occupied
		psubb	mm5, mm1	; occupied -   rooks
		psubb	mm5, mm1	; occupied - 2*rooks
		por	mm0, mm4	; queenAttacks |= down
		pxor	mm4, mm4	; 0x0000000000000000
		psubb	mm4, mm7	; 0x0101010101010101 0-(-1)
		pxor	mm5, mm3 ; right := occupied ^ (occupied - 2*rooks)
		// diagonals and left dumb7fill
		movq	mm2, mm1	; leftup
		psubb	mm7, mm4	; 0xfefefefefefefefe notA 0xff-0x01
		por	mm0, mm5	; queenAttacks |= right
		movq	mm5, mm7	; 0xfefefefefefefefe notA
		pand	mm2, mm7	; clear left occupied or a file
		psrlq	mm5, 1		; 0x7f7f7f7f7f7f7f7f notH
		movq	mm3, mm2	; leftdown
		pand	mm1, mm5	; clear right occupied or h file
		pand	mm7, mm6	; to clear left occupied or a-file
		pand	mm6, mm5	; to clear right occupied or h-file
		movq	mm4, mm1	; rightdown
		movq	mm5, mm2	; left
		// 1. fill diagonals and left
		psllq	mm1, 9		; rightup
		psrlq	mm4, 7		; rightdown
		psllq	mm2, 7		; leftup
		psrlq	mm3, 9		; leftdown
		psrlq	mm5, 1		; left
		por	mm0, mm1	; queenAttacks |= rightup
		por	mm0, mm4	; queenAttacks |= rightdown
		por	mm0, mm2	; queenAttacks |= leftup
		por	mm0, mm3	; queenAttacks |= leftdown
		por	mm0, mm5	; queenAttacks |= left
		pand	mm1, mm6	; clear rightup occupied or h file
		pand	mm4, mm6	; clear rightdown occupied or h file
		pand	mm2, mm7	; clear leftup occupied or a file
		pand	mm3, mm7	; clear leftdown occupied or a file
		pand	mm5, mm7	; clear left occupied or a file
		// 2. fill diagonals and left
		psllq	mm1, 9		; rightup
		psrlq	mm4, 7		; rightdown
		psllq	mm2, 7		; leftup
		psrlq	mm3, 9		; leftdown
		psrlq	mm5, 1		; left
		por	mm0, mm1	; queenAttacks |= rightup
		por	mm0, mm4	; queenAttacks |= rightdown
		por	mm0, mm2	; queenAttacks |= leftup
		por	mm0, mm3	; queenAttacks |= leftdown
		por	mm0, mm5	; queenAttacks |= left
		pand	mm1, mm6	; clear rightup occupied or h file
		pand	mm4, mm6	; clear rightdown occupied or h file
		pand	mm2, mm7	; clear leftup occupied or a file
		pand	mm3, mm7	; clear leftdown occupied or a file
		pand	mm5, mm7	; clear left occupied or a file
		// 3. fill diagonals and left
		psllq	mm1, 9		; rightup
		psrlq	mm4, 7		; rightdown
		psllq	mm2, 7		; leftup
		psrlq	mm3, 9		; leftdown
		psrlq	mm5, 1		; left
		por	mm0, mm1	; queenAttacks |= rightup
		por	mm0, mm4	; queenAttacks |= rightdown
		por	mm0, mm2	; queenAttacks |= leftup
		por	mm0, mm3	; queenAttacks |= leftdown
		por	mm0, mm5	; queenAttacks |= left
		pand	mm1, mm6	; clear rightup occupied or h file
		pand	mm4, mm6	; clear rightdown occupied or h file
		pand	mm2, mm7	; clear leftup occupied or a file
		pand	mm3, mm7	; clear leftdown occupied or a file
		pand	mm5, mm7	; clear left occupied or a file
		// 4. fill diagonals and left
		psllq	mm1, 9		; rightup
		psrlq	mm4, 7		; rightdown
		psllq	mm2, 7		; leftup
		psrlq	mm3, 9		; leftdown
		psrlq	mm5, 1		; left
		por	mm0, mm1	; queenAttacks |= rightup
		por	mm0, mm4	; queenAttacks |= rightdown
		por	mm0, mm2	; queenAttacks |= leftup
		por	mm0, mm3	; queenAttacks |= leftdown
		por	mm0, mm5	; queenAttacks |= left
		pand	mm1, mm6	; clear rightup occupied or h file
		pand	mm4, mm6	; clear rightdown occupied or h file
		pand	mm2, mm7	; clear leftup occupied or a file
		pand	mm3, mm7	; clear leftdown occupied or a file
		pand	mm5, mm7	; clear left occupied or a file
		// 5. fill diagonals and left
		psllq	mm1, 9		; rightup
		psrlq	mm4, 7		; rightdown
		psllq	mm2, 7		; leftup
		psrlq	mm3, 9		; leftdown
		psrlq	mm5, 1		; left
		por	mm0, mm1	; queenAttacks |= rightup
		por	mm0, mm4	; queenAttacks |= rightdown
		por	mm0, mm2	; queenAttacks |= leftup
		por	mm0, mm3	; queenAttacks |= leftdown
		por	mm0, mm5	; queenAttacks |= left
		pand	mm1, mm6	; clear rightup occupied or h file
		pand	mm4, mm6	; clear rightdown occupied or h file
		pand	mm2, mm7	; clear leftup occupied or a file
		pand	mm3, mm7	; clear leftdown occupied or a file
		pand	mm5, mm7	; clear left occupied or a file
		// 6. fill diagonals and left
		psllq	mm1, 9		; rightup
		psrlq	mm4, 7		; rightdown
		psllq	mm2, 7		; leftup
		psrlq	mm3, 9		; leftdown
		psrlq	mm5, 1		; left
		por	mm0, mm1	; queenAttacks |= rightup
		por	mm0, mm4	; queenAttacks |= rightdown
		por	mm0, mm2	; queenAttacks |= leftup
		por	mm0, mm3	; queenAttacks |= leftdown
		por	mm0, mm5	; queenAttacks |= left
		pand	mm1, mm6	; clear rightup occupied or h file
		pand	mm4, mm6	; clear rightdown occupied or h file
		pand	mm2, mm7	; clear leftup occupied or a file
		pand	mm3, mm7	; clear leftdown occupied or a file
		pand	mm5, mm7	; clear left occupied or a file
		// 7. fill diagonals and left
		psllq	mm1, 9		; rightup
		psrlq	mm4, 7		; rightdown
		psllq	mm2, 7		; leftup
		psrlq	mm3, 9		; leftdown
		psrlq	mm5, 1		; left
		por	mm0, mm1	; queenAttacks |= rightup
		por	mm0, mm4	; queenAttacks |= rightdown
		por	mm0, mm2	; queenAttacks |= leftup
		por	mm0, mm3	; queenAttacks |= leftdown
		por	mm0, mm5	; queenAttacks |= left
	}
}


The classical rotated bitboard approach i compared with:


#define BITBOARD_ALIGN __declspec(align(8))

BitBoard BITBOARD_ALIGN sFileAtta[64][64]; // [sq][innerOccupiedstate]
BitBoard sRankAtta[64][64];
BitBoard sA1H8Atta[64][64];
BitBoard sH1A8Atta[64][64];

BitBoard OccuBB90 = 0;
BitBoard OccuBB00 = 0;
BitBoard OccuBBA1H8 = 0;
BitBoard OccuBBH1A8 = 0;


// output   mm0 queenAttacks
//==========================
void getQueenAttacks (int sq)
{
	__asm
	{
		mov		eax, sq
		mov		edx, eax
		mov		ecx, eax
		and		edx, 7			; file
		shr		ecx, 3			; rank
		mov		edi, ecx		; rank to edi
		shl		eax, 9

		movzx	edx, [OccuBB90 + edx]; state on file
		movzx	ecx, [OccuBB00 + ecx]; state on rank
		and	edx, 0x7e
		and	ecx, 0x7e
		movq	mm0, [sFileAtta + eax + edx*4]
		por	mm0, [sRankAtta + eax + ecx*4]

		mov	edx, eax
		shr	edx, 9	 ; square = file here because of "and 7"
		mov	ecx, edx
		sub	edx, edi ; file - rank
		not	ecx
		sub	ecx, edi ; ~file - rank
		and	edx, 7
		and	ecx, 7

		movzx	edx, [OccuBBA1H8 + edx]
		movzx	ecx, [OccuBBH1A8 + ecx]
		and	edx, 0x7e
		and	ecx, 0x7e
		por	mm0, [sA1H8Atta + eax + edx*4]
		por	mm0, [sH1A8Atta + eax + ecx*4]
	}
}





This page took 0 seconds to execute

Last modified: Thu, 15 Apr 21 08:11:13 -0700

Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.