Computer Chess Club Archives


Search

Terms

Messages

Subject: Re: MMX-Kogge-Strone gains the lead

Author: Alessandro Damiani

Date: 12:05:43 09/24/02

Go up one level in this thread


On September 24, 2002 at 12:43:21, Gerd Isenberg wrote:

>Hi all,
>
>few days ago i posted some optimized direction fill routines (dumb7fill),
>to generate attack bitboards for multiple (sliding) pieces:
>
>http://www.talkchess.com/forums/1/message.html?253039
>http://www.talkchess.com/forums/1/message.html?253146
>
>After a few "factoring outs" and some reordering of mmx-instructions, my
>conclusion was "hard time for Kogge-Stone", because the dumb7fill needs only few
>registers, so that is was possible to do four directions simultaniously without
>reading any constant datas.
>
>Since Kogge-Stone is preferable under algorithm point of view, because only
>three (and one final shift) instead of seven fill iterations per direction,
>the drawback of Kogge-Stone is the need of more registers because generator
>and propagator have to be shifted, and to store intermediate generators and
>propagators.
>
>Sample C- Source of "Kogge-Stone parallel prefix algorithm" inroduced to me
>by Steffan Westcott:
>
>BitBoard FillUpOccluded(BitBoard g, BitBoard p)
>{
>	g |= p & (g << 8);
>	p &= (p << 8);
>	g |= p & (g << 16);
>	p &= (p << 16);
>	return g |= p & (g << 32);
>}
>
>BitBoard RookAttacksUp(BitBoard rooks, BitBoard empty_squares)
>{
>	return ShiftUp(FillUpOccluded(rooks, empty_squares));
>}
>
>See Steffan Westcotts reply to my post:
>http://www.talkchess.com/forums/1/message.html?252198
>http://www.talkchess.com/forums/1/message.html?252289
>
>I was so ambitious doing a smarter Kogge-Stone mmx-implementation by
>introducing explicit parallelism to fill two directions simultaniously.
>
>This Kogge-Stone mmx-implemetation for rooks, is slightly faster than
>dumb7fill, (26 verus 27 seconds for 10**9 calls) although it's few bytes
>longer and reads some constant, cache friendly data.
>
>I am quite sure, that Kogge-Stone will gain more from amd's hammer. Hammer
>will have sixteen (14 if we exclude RSP and RBP) general purpose 64bit
>registers, eight 64bit mmx-registers (shared with x87) and eight 128bit
>xmm-registers! It should be possible to fill at least four directions
>independendly, without saving too many of the gp-registers.
>
>cheers,
>Gerd
>

Hi Gerd!

Did I get it right that this method is able to generate the attacks by one rook
or all the rooks at once, just by passing the appropriate Bitboard as a
parameter?

Cheers,
Alessandro




>
>#define CACHE_ALIGN __declspec(align(32))
>
>// input:	mm1 rooks
>//		mm6 freeSquares (!changed)
>// output:	mm0 rookattacks
>//========================================
>void getRookAttacksByKoggeStoneMMX()
>{
>	static const struct
>	{
>		BitBoard notA;
>		BitBoard notAH;
>		BitBoard EFGH;
>		BitBoard ABCD;
>	} CACHE_ALIGN masks =
>	{
>		0xfefefefefefefefe,
>		0x7e7e7e7e7e7e7e7e,
>		0xf0f0f0f0f0f0f0f0,
>		0x0f0f0f0f0f0f0f0f
>	};
>	__asm
>	{
>		// up/down up/dn Kogge Stone
>		movq	mm2, mm1	; generator upg
>		movq	mm4, mm1	; generator dng
>		movq	mm5, mm6	; propagator upp
>		movq	mm7, mm6	; propagator dnp
>		psllq	mm2, 8		; upg << 8
>		psrlq	mm4, 8		; dng >> 8
>		psllq	mm5, 8		; upp << 8
>		psrlq	mm7, 8		; dnp >> 8
>		pand	mm2, mm6	; upp & (upg << 8)
>		pand	mm4, mm6	; dnp & (dng >> 8)
>		pand	mm5, mm6	; upp &= upp << 8
>		pand	mm7, mm6	; dnp &= dnp >> 8
>		por	mm2, mm1	; upg |= upp & (upg << 8)
>		por	mm4, mm1	; dng |= dnp & (dng >> 8)
>
>		movq	mm0, mm2	; upg
>		movq	mm3, mm4	; dng
>		psllq	mm2, 16		; upg << 16
>		psrlq	mm4, 16		; dng >> 16
>		pand	mm2, mm5	; upp & (upg << 16)
>		pand	mm4, mm7	; dnp & (dng >> 16)
>		por	mm2, mm0	; upg |= upp & (upg << 16)
>		por	mm4, mm3	; dng |= dnp & (dng >> 16)
>		movq	mm0, mm5	; upp
>		movq	mm3, mm7	; dnp
>		psllq	mm5, 16		; upp << 16
>		psrlq	mm7, 16		; dnp >> 16
>		pand	mm5, mm0	; upp &= p << 16
>		pand	mm7, mm3	; dnp &= p >> 16
>
>		movq	mm0, mm2	; upg
>		movq	mm3, mm4	; dng
>		psllq	mm0, 32		; upg << 32
>		psrlq	mm4, 32		; dng >> 32
>		lea	eax, [masks]
>		pand	mm0, mm5	; upp & (upg << 32)
>		pand	mm4, mm7	; dnp & (dng >> 32)
>		por	mm0, mm2	; upg |= upp & (upg << 32)
>		por	mm4, mm3	; dng |= dnp & (dng >> 32)
>		movq	mm5, mm6	; propagator rip
>		psllq	mm0, 8		; final shift up
>		psrlq	mm4, 8		; final shift dn
>		movq	mm2, mm1	; generator rig
>		paddb	mm5, mm5	; rip << 1
>		por	mm0, mm4
>
>		// right /left ri/le Kogge Stone
>		movq	mm4, [eax.notA]
>		movq	mm7, [eax.notAH]
>		pand	mm4, mm1	; generator leg, but no wrap
>		pand	mm7, mm6	; propagator lep, but no wrap
>		paddb	mm2, mm2	; rig << 1
>		psrlq	mm4, 1		; leg >> 1
>		psrlq	mm7, 1		; lep >> 1
>		pand	mm2, mm6	; rip & (rig << 1)
>		pand	mm4, mm6	; lep & (leg >> 1)
>		pand	mm5, mm6	; rip &= rip << 1
>		pand	mm7, mm6	; lep &= lep >> 1
>		por	mm2, mm1	; rig |= rip & (rig >> 1)
>		por	mm4, mm1	; leg |= lep & (leg >> 1)
>
>		movq	mm1, mm2	; rig
>		movq	mm3, mm4	; leg
>		paddb	mm1, mm2	; rig << 1
>		psrlq	mm4, 2		; leg >> 2
>		paddb	mm1, mm1	; rig << 2
>		pand	mm4, mm7	; lep & (leg >> 2)
>		pand	mm1, mm5	; rip & (rig << 2)
>		por	mm1, mm2	; rig |= rip & (rig << 2)
>		por	mm4, mm3	; leg |= lep & (leg >> 2)
>		movq	mm2, [eax.EFGH]	; but no wrap for rip & (rig << 4)
>		movq	mm3, [eax.ABCD]	; but no wrap for lep & (leg >> 4)
>		pand	mm2, mm5	; rip
>		pand	mm3, mm7	; lep
>		paddb	mm5, mm5	; rip << 1
>		psrlq	mm7, 2		; lep >> 2
>		paddb	mm5, mm5	; rip << 2
>		pand	mm7, mm3	; lep &= lep >> 2
>		pand	mm5, mm2	; rip &= rip << 2
>
>		movq	mm3, mm4	; leg
>		movq	mm2, mm1	; rig
>		psrlq	mm4, 4		; leg >> 4
>		psllq	mm1, 4		; rig << 4
>		pand	mm4, mm7	; lep & (leg >> 4)
>		pand	mm1, mm5	; rip & (rig << 4)
>		por	mm4, mm3	; leg |= lep & (leg >> 4)
>		por	mm1, mm2	; rig |= rip & (rig << 4)
>		pand	mm4, [eax.notA]	; but no wrap
>		paddb	mm1, mm1	; final shift rigth
>		psrlq	mm4, 1		; final shift left
>		por	mm0, mm1
>		por	mm0, mm4
>	}
>}



This page took 0 seconds to execute

Last modified: Thu, 15 Apr 21 08:11:13 -0700

Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.