Author: Gerd Isenberg
Date: 15:28:29 09/16/02
Go up one level in this thread
Hi Steffan,
I think, i have it now. Thanks again :-)
Kogge-Stone parallel prefix algorithm - really fascinating!
But after trying an (may be not perfect) mmx-implementation surprisingly the
(dumb) direction fill routine seems to win, at least with mmx!
getRookAttacks KoggeStoneMMX DumbMMX
movs/etc 36 (29) 13*(6) *two performance
shifts/paddb 26* 28 *two paddb per << 2
and 25 31
or 15 27
total 95 92
asm byte size 350 317
The direction fill has even less register dependencies and with enough integer
pipes four directions may processed simultaniously in each of the seven
iterations, like four special purpose shift/and-registers doing one iteration
per cycle.
So disappointed about it, but Kogge-Stone needs too many intermediate results.
The moves are so expensive as the logicals (at least in x86s), in code size and
in execution speed.
Cheers,
Gerd
compare yourself, somethig to improve?
BitBoard getRookAttacksByKoggeStoneMMX(BitBoard freeSquares, BitBoard rooks)
{
static const BitBoard EFGH = 0xf0f0f0f0f0f0f0f0;
static const BitBoard notH = 0x7f7f7f7f7f7f7f7f;
static const BitBoard notGH = 0x3f3f3f3f3f3f3f3f;
static const BitBoard ABCD = 0x0f0f0f0f0f0f0f0f;
__asm
{
movd mm6, [freeSquares]
punpckldq mm6, [freeSquares+4]
movd mm1, [rooks]
punpckldq mm1, [rooks+4]
// up
movq mm3, mm1 ; generator g
movq mm5, mm6 ; propagator p
psllq mm3, 8 ; g << 8
movq mm4, mm6 ; p
psllq mm5, 8 ; p << 8
movq mm2, mm1 ; g
pand mm3, mm4 ; p & (g << 8)
por mm2, mm3 ; g |= p & (g << 8)
pand mm4, mm5 ; p &= (p << 8)
movq mm3, mm2 ; g
movq mm5, mm4 ; p
psllq mm3, 16 ; g << 16
psllq mm5, 16 ; p << 16
pand mm3, mm4 ; p & (g << 16)
por mm2, mm3 ; g | p & (g << 16)
pand mm4, mm5 ; p & (p << 16)
movq mm3, mm2 ; g
psllq mm3, 32 ; g << 32
pand mm3, mm4 ; p & (g << 32)
por mm2, mm3 ; g | p & (g << 32)
psllq mm2, 8 ; final shift up
movq mm0, mm2 ; save up so far
// down
movq mm3, mm1 ; generator g
movq mm5, mm6 ; propagator p
psrlq mm3, 8 ; g >> 8
movq mm4, mm6 ; p
psrlq mm5, 8 ; p >> 8
movq mm2, mm1 ; g
pand mm3, mm4 ; p & (g >> 8)
por mm2, mm3 ; g |= p & (g >> 8)
pand mm4, mm5 ; p &= (p >> 8)
movq mm3, mm2 ; g
movq mm5, mm4 ; p
psrlq mm3, 16 ; g >> 16
psrlq mm5, 16 ; p >> 16
pand mm3, mm4 ; p & (g >> 16)
por mm2, mm3 ; g |= p & (g >> 16)
pand mm4, mm5 ; p &= (p >> 16)
movq mm3, mm2 ; g
psrlq mm3, 32 ; g >> 32
pand mm3, mm4 ; p & (g >> 32)
por mm2, mm3 ; g | p & (g >> 32)
psrlq mm2, 8 ; final shift down
por mm0, mm2 ; to final result
// right
movq mm3, mm1 ; generator g
movq mm5, mm6 ; propagator p
paddb mm3, mm3 ; g << 1
movq mm4, mm6 ; p
paddb mm5, mm5 ; p << 1
movq mm2, mm1 ; g
pand mm3, mm4 ; p & (g << 1)
por mm2, mm3 ; g |= p & (g << 1)
pand mm4, mm5 ; p &= (p << 1)
movq mm3, mm2 ; g
movq mm5, mm4 ; p
paddb mm3, mm3 ; g << 1
paddb mm5, mm5 ; p << 1
paddb mm3, mm3 ; g << 2
paddb mm5, mm5 ; p << 2
pand mm3, mm4 ; p & (g << 2)
por mm2, mm3 ; g |= p & (g << 2)
pand mm4, mm5 ; p &= (p << 2)
movq mm3, mm2 ; g
pand mm4, [EFGH] ; considers wraps
psllq mm3, 4 ; (g << 4)
pand mm3, mm4 ; p & (g << 4)
por mm2, mm3 ; g | p & (g << 4)
paddb mm2, mm2 ; final shift right
por mm0, mm2 ; to final result
// left
movq mm3, mm1 ; generator g
movq mm5, mm6 ; propagator p
psrlq mm3, 1 ; g >> 1
movq mm4, mm6 ; p
psrlq mm5, 1 ; p >> 1
pand mm4, [notH] ; considers wraps
movq mm2, mm1 ; g
pand mm3, mm4 ; p & (g >> 1)
por mm2, mm3 ; g |= p & (g >> 1)
pand mm4, mm5 ; p &= (p >> 1)
movq mm3, mm2 ; g
movq mm5, mm4 ; p
pand mm4, [notGH] ; considers wraps
psrlq mm3, 2 ; g >> 2
psrlq mm5, 2 ; p >> 2
pand mm3, mm4 ; p & (g >> 2)
por mm2, mm3 ; g |= p & (g >> 2)
pand mm4, mm5 ; p &= (p >> 2)
movq mm3, mm2 ; g
pand mm4, [ABCD] ; considers wraps
psrlq mm3, 4 ; g >> 4
pand mm3, mm4 ; p & (g >> 4)
por mm2, mm3 ; g |= p & (g >> 4)
psrlq mm2, 1 ; final shift left
pand mm2, [notH] ; but no wrap
por mm0, mm2 ; to final result
pswapd mm1, mm0
movd eax, mm0
movd edx, mm1
}
}
// dumb but shorter and faster
BitBoard getRookAttacksMMX(BitBoard freeSquares, BitBoard rooks)
{
static const BitBoard eight = 8;
static const BitBoard notH = 0x7f7f7f7f7f7f7f7f;
__asm
{
movd mm6, [freeSquares]
punpckldq mm6, [freeSquares+4]
movd mm1, [rooks]
punpckldq mm1, [rooks+4]
movq mm5, [eight]; saves some bytes in unrolled loop
movq mm7, [notH] ; saves some bytes in unrolled loop
movq mm2, mm1 ; left
movq mm3, mm1 ; up
movq mm4, mm1 ; down
// 1. straight fill in each direction
paddb mm1, mm1 ; right
psrlq mm2, 1 ; left
psllq mm3, mm5 ; up
movq mm0, mm1 ; rookAttacks = right
pand mm2, mm7 ; clear left h-file wraps
pand mm1, mm6 ; clear right occupied
por mm0, mm3 ; rookAttacks |= up
psrlq mm4, mm5 ; down
por mm0, mm2 ; rookAttacks |= left
pand mm3, mm6 ; clear up occupied
por mm0, mm4 ; rookAttacks |= down
pand mm2, mm6 ; clear left occupied
pand mm4, mm6 ; clear down occupied
// 2. straight fill in each direction
paddb mm1, mm1 ; right
psrlq mm2, 1 ; left
psllq mm3, mm5 ; up
por mm0, mm1 ; rookAttacks |= right
pand mm2, mm7 ; clear left h-file wraps
pand mm1, mm6 ; clear right occupied
por mm0, mm3 ; rookAttacks |= up
psrlq mm4, mm5 ; down
por mm0, mm2 ; rookAttacks |= left
pand mm3, mm6 ; clear up occupied
por mm0, mm4 ; rookAttacks |= down
pand mm2, mm6 ; clear left occupied
pand mm4, mm6 ; clear down occupied
// 3. straight fill in each direction
paddb mm1, mm1 ; right
psrlq mm2, 1 ; left
psllq mm3, mm5 ; up
por mm0, mm1 ; rookAttacks |= right
pand mm2, mm7 ; clear left h-file wraps
pand mm1, mm6 ; clear right occupied
por mm0, mm3 ; rookAttacks |= up
psrlq mm4, mm5 ; down
por mm0, mm2 ; rookAttacks |= left
pand mm3, mm6 ; clear up occupied
por mm0, mm4 ; rookAttacks |= down
pand mm2, mm6 ; clear left occupied
pand mm4, mm6 ; clear down occupied
// 4. straight fill in each direction
paddb mm1, mm1 ; right
psrlq mm2, 1 ; left
psllq mm3, mm5 ; up
por mm0, mm1 ; rookAttacks |= right
pand mm2, mm7 ; clear left h-file wraps
pand mm1, mm6 ; clear right occupied
por mm0, mm3 ; rookAttacks |= up
psrlq mm4, mm5 ; down
por mm0, mm2 ; rookAttacks |= left
pand mm3, mm6 ; clear up occupied
por mm0, mm4 ; rookAttacks |= down
pand mm2, mm6 ; clear left occupied
pand mm4, mm6 ; clear down occupied
// 5. straight fill in each direction
paddb mm1, mm1 ; right
psrlq mm2, 1 ; left
psllq mm3, mm5 ; up
por mm0, mm1 ; rookAttacks |= right
pand mm2, mm7 ; clear left h-file wraps
pand mm1, mm6 ; clear right occupied
por mm0, mm3 ; rookAttacks |= up
psrlq mm4, mm5 ; down
por mm0, mm2 ; rookAttacks |= left
pand mm3, mm6 ; clear up occupied
por mm0, mm4 ; rookAttacks |= down
pand mm2, mm6 ; clear left occupied
pand mm4, mm6 ; clear down occupied
// 6. straight fill in each direction
paddb mm1, mm1 ; right
psrlq mm2, 1 ; left
psllq mm3, mm5 ; up
por mm0, mm1 ; rookAttacks |= right
pand mm2, mm7 ; clear left h-file wraps
pand mm1, mm6 ; clear right occupied
por mm0, mm3 ; rookAttacks |= up
psrlq mm4, mm5 ; down
por mm0, mm2 ; rookAttacks |= left
pand mm3, mm6 ; clear up occupied
por mm0, mm4 ; rookAttacks |= down
pand mm2, mm6 ; clear left occupied
pand mm4, mm6 ; clear down occupied
// 7. straight fill in each direction
paddb mm1, mm1 ; right
psrlq mm2, 1 ; left
psllq mm3, mm5 ; up
por mm0, mm1 ; rookAttacks |= right
pand mm2, mm7 ; clear left h-file wraps
por mm0, mm3 ; rookAttacks |= up
psrlq mm4, mm5 ; down
por mm0, mm2 ; rookAttacks |= left
por mm0, mm4 ; rookAttacks |= down
pswapd mm1, mm0
movd eax, mm0
movd edx, mm1
}
}
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.