Computer Chess Club Archives

Search

Terms
Messages

Subject: direction fill vs. Kogge-Stone

Author: Gerd Isenberg
Date: 15:28:29 09/16/02
Hi Steffan,

I think, i have it now. Thanks again :-)
Kogge-Stone parallel prefix algorithm - really fascinating!

But after trying an (may be not perfect) mmx-implementation surprisingly the
(dumb) direction fill routine seems to win, at least with mmx!

getRookAttacks	KoggeStoneMMX	DumbMMX
movs/etc         36 (29)	 13*(6)	*two performance
shifts/paddb	 26*		 28	*two paddb per << 2
and		 25		 31
or		 15		 27
total		 95		 92
asm byte size   350		317

The direction fill has even less register dependencies and with enough integer
pipes four directions may processed simultaniously in each of the seven
iterations, like four special purpose shift/and-registers doing one iteration
per cycle.

So disappointed about it, but Kogge-Stone needs too many intermediate results.
The moves are so expensive as the logicals (at least in x86s), in code size and
in execution speed.

Cheers,
Gerd



compare yourself, somethig to improve?


BitBoard getRookAttacksByKoggeStoneMMX(BitBoard freeSquares, BitBoard rooks)
{
	static const BitBoard EFGH  = 0xf0f0f0f0f0f0f0f0;
	static const BitBoard notH  = 0x7f7f7f7f7f7f7f7f;
	static const BitBoard notGH = 0x3f3f3f3f3f3f3f3f;
	static const BitBoard ABCD  = 0x0f0f0f0f0f0f0f0f;
	__asm
	{
		movd	mm6,   [freeSquares]
		punpckldq mm6, [freeSquares+4]
		movd	mm1,   [rooks]
		punpckldq mm1, [rooks+4]

		// up
		movq	mm3, mm1	; generator g
		movq	mm5, mm6	; propagator p
		psllq	mm3, 8		; g << 8
		movq	mm4, mm6	; p
		psllq	mm5, 8		; p << 8
		movq	mm2, mm1	; g
		pand	mm3, mm4	; p & (g << 8)
		por	mm2, mm3	; g |= p & (g << 8)
		pand	mm4, mm5	; p &= (p << 8)

		movq	mm3, mm2	; g
		movq	mm5, mm4	; p
		psllq	mm3, 16		; g << 16
		psllq	mm5, 16		; p << 16
		pand	mm3, mm4	; p & (g << 16)
		por	mm2, mm3	; g | p & (g << 16)
		pand	mm4, mm5	; p & (p << 16)

		movq	mm3, mm2	; g
		psllq	mm3, 32		; g << 32
		pand	mm3, mm4	; p & (g << 32)
		por	mm2, mm3	; g | p & (g << 32)
		psllq	mm2, 8		; final shift up
		movq	mm0, mm2	; save up so far

		// down
		movq	mm3, mm1	; generator g
		movq	mm5, mm6	; propagator p
		psrlq	mm3, 8		; g >> 8
		movq	mm4, mm6	; p
		psrlq	mm5, 8		; p >> 8
		movq	mm2, mm1	; g
		pand	mm3, mm4	; p & (g >> 8)
		por	mm2, mm3	; g |= p & (g >> 8)
		pand	mm4, mm5	; p &= (p >> 8)

		movq	mm3, mm2	; g
		movq	mm5, mm4	; p
		psrlq	mm3, 16		; g >> 16
		psrlq	mm5, 16		; p >> 16
		pand	mm3, mm4	; p & (g >> 16)
		por	mm2, mm3	; g |= p & (g >> 16)
		pand	mm4, mm5	; p &= (p >> 16)

		movq	mm3, mm2	; g
		psrlq	mm3, 32		; g >> 32
		pand	mm3, mm4	; p & (g >> 32)
		por	mm2, mm3	; g | p & (g >> 32)
		psrlq	mm2, 8		; final shift down
		por	mm0, mm2	; to final result

		// right
		movq	mm3, mm1	; generator g
		movq	mm5, mm6	; propagator p
		paddb	mm3, mm3	; g << 1
		movq	mm4, mm6	; p
		paddb	mm5, mm5	; p << 1
		movq	mm2, mm1	; g
		pand	mm3, mm4	; p & (g << 1)
		por	mm2, mm3	; g |= p & (g << 1)
		pand	mm4, mm5	; p &= (p << 1)

		movq	mm3, mm2	; g
		movq	mm5, mm4	; p
		paddb	mm3, mm3	; g << 1
		paddb	mm5, mm5	; p << 1
		paddb	mm3, mm3	; g << 2
		paddb	mm5, mm5	; p << 2
		pand	mm3, mm4	; p & (g << 2)
		por	mm2, mm3	; g |= p & (g << 2)
		pand	mm4, mm5	; p &= (p << 2)

		movq	mm3, mm2	; g
		pand	mm4, [EFGH]	; considers wraps
		psllq	mm3, 4		; (g << 4)
		pand	mm3, mm4	; p & (g << 4)
		por	mm2, mm3	; g | p & (g << 4)
		paddb	mm2, mm2	; final shift right
		por	mm0, mm2	; to final result

		// left
		movq	mm3, mm1	; generator g
		movq	mm5, mm6	; propagator p
		psrlq	mm3, 1		; g >> 1
		movq	mm4, mm6	; p
		psrlq	mm5, 1		; p >> 1
		pand	mm4, [notH]	; considers wraps
		movq	mm2, mm1	; g
		pand	mm3, mm4	; p & (g >> 1)
		por	mm2, mm3	; g |= p & (g >> 1)
		pand	mm4, mm5	; p &= (p >> 1)

		movq	mm3, mm2	; g
		movq	mm5, mm4	; p
		pand	mm4, [notGH]	; considers wraps
		psrlq	mm3, 2		; g >> 2
		psrlq	mm5, 2		; p >> 2
		pand	mm3, mm4	; p & (g >> 2)
		por	mm2, mm3	; g |= p & (g >> 2)
		pand	mm4, mm5	; p &= (p >> 2)

		movq	mm3, mm2	; g
		pand	mm4, [ABCD]	; considers wraps
		psrlq	mm3, 4		; g >> 4
		pand	mm3, mm4	; p & (g >> 4)
		por	mm2, mm3	; g |= p & (g >> 4)
		psrlq	mm2, 1		; final shift left
		pand	mm2, [notH]	; but no wrap
		por	mm0, mm2	; to final result

		pswapd	mm1, mm0
		movd	eax, mm0
		movd	edx, mm1
	}
}

// dumb but shorter and faster
BitBoard getRookAttacksMMX(BitBoard freeSquares, BitBoard rooks)
{
	static const BitBoard eight = 8;
	static const BitBoard notH = 0x7f7f7f7f7f7f7f7f;
	__asm
	{
		movd	mm6,   [freeSquares]
		punpckldq mm6, [freeSquares+4]
		movd	mm1,   [rooks]
		punpckldq mm1, [rooks+4]

		movq	mm5, [eight]; saves some bytes in unrolled loop
		movq	mm7, [notH]	; saves some bytes in unrolled loop
		movq	mm2, mm1	; left
		movq	mm3, mm1	; up
		movq	mm4, mm1	; down
		// 1. straight fill in each direction
		paddb	mm1, mm1	; right
		psrlq	mm2, 1		; left
		psllq	mm3, mm5	; up
		movq	mm0, mm1	; rookAttacks = right
		pand	mm2, mm7	; clear left h-file wraps
		pand	mm1, mm6	; clear right occupied
		por	mm0, mm3	; rookAttacks |= up
		psrlq	mm4, mm5	; down
		por	mm0, mm2	; rookAttacks |= left
		pand	mm3, mm6	; clear up occupied
		por	mm0, mm4	; rookAttacks |= down
		pand	mm2, mm6	; clear left occupied
		pand	mm4, mm6	; clear down occupied
		// 2. straight fill in each direction
		paddb	mm1, mm1	; right
		psrlq	mm2, 1		; left
		psllq	mm3, mm5	; up
		por	mm0, mm1	; rookAttacks |= right
		pand	mm2, mm7	; clear left h-file wraps
		pand	mm1, mm6	; clear right occupied
		por	mm0, mm3	; rookAttacks |= up
		psrlq	mm4, mm5	; down
		por	mm0, mm2	; rookAttacks |= left
		pand	mm3, mm6	; clear up occupied
		por	mm0, mm4	; rookAttacks |= down
		pand	mm2, mm6	; clear left occupied
		pand	mm4, mm6	; clear down occupied
		// 3. straight fill in each direction
		paddb	mm1, mm1	; right
		psrlq	mm2, 1		; left
		psllq	mm3, mm5	; up
		por	mm0, mm1	; rookAttacks |= right
		pand	mm2, mm7	; clear left h-file wraps
		pand	mm1, mm6	; clear right occupied
		por	mm0, mm3	; rookAttacks |= up
		psrlq	mm4, mm5	; down
		por	mm0, mm2	; rookAttacks |= left
		pand	mm3, mm6	; clear up occupied
		por	mm0, mm4	; rookAttacks |= down
		pand	mm2, mm6	; clear left occupied
		pand	mm4, mm6	; clear down occupied
		// 4. straight fill in each direction
		paddb	mm1, mm1	; right
		psrlq	mm2, 1		; left
		psllq	mm3, mm5	; up
		por	mm0, mm1	; rookAttacks |= right
		pand	mm2, mm7	; clear left h-file wraps
		pand	mm1, mm6	; clear right occupied
		por	mm0, mm3	; rookAttacks |= up
		psrlq	mm4, mm5	; down
		por	mm0, mm2	; rookAttacks |= left
		pand	mm3, mm6	; clear up occupied
		por	mm0, mm4	; rookAttacks |= down
		pand	mm2, mm6	; clear left occupied
		pand	mm4, mm6	; clear down occupied
		// 5. straight fill in each direction
		paddb	mm1, mm1	; right
		psrlq	mm2, 1		; left
		psllq	mm3, mm5	; up
		por	mm0, mm1	; rookAttacks |= right
		pand	mm2, mm7	; clear left h-file wraps
		pand	mm1, mm6	; clear right occupied
		por	mm0, mm3	; rookAttacks |= up
		psrlq	mm4, mm5	; down
		por	mm0, mm2	; rookAttacks |= left
		pand	mm3, mm6	; clear up occupied
		por	mm0, mm4	; rookAttacks |= down
		pand	mm2, mm6	; clear left occupied
		pand	mm4, mm6	; clear down occupied
		// 6. straight fill in each direction
		paddb	mm1, mm1	; right
		psrlq	mm2, 1		; left
		psllq	mm3, mm5	; up
		por	mm0, mm1	; rookAttacks |= right
		pand	mm2, mm7	; clear left h-file wraps
		pand	mm1, mm6	; clear right occupied
		por	mm0, mm3	; rookAttacks |= up
		psrlq	mm4, mm5	; down
		por	mm0, mm2	; rookAttacks |= left
		pand	mm3, mm6	; clear up occupied
		por	mm0, mm4	; rookAttacks |= down
		pand	mm2, mm6	; clear left occupied
		pand	mm4, mm6	; clear down occupied
		// 7. straight fill in each direction
		paddb	mm1, mm1	; right
		psrlq	mm2, 1		; left
		psllq	mm3, mm5	; up
		por	mm0, mm1	; rookAttacks |= right
		pand	mm2, mm7	; clear left h-file wraps
		por	mm0, mm3	; rookAttacks |= up
		psrlq	mm4, mm5	; down
		por	mm0, mm2	; rookAttacks |= left
		por	mm0, mm4	; rookAttacks |= down

		pswapd	mm1, mm0
		movd	eax, mm0
		movd	edx, mm1
	}
}
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.