Author: Gerd Isenberg
Date: 08:37:28 10/01/02
Go up one level in this thread
On October 01, 2002 at 08:07:58, Vincent Diepeveen wrote:
Hi Vincent,
>On October 01, 2002 at 03:58:50, Gerd Isenberg wrote:
>
>If i may remind you the hyperbola guy said that his idea
>was just a 'thought'. Nothing concrete implemented.
Yes.
>
>Anyway, which processor did you test it at for the speed
>and were both tests within the mmx and how would it do outside
>mmx and inside the normal processor?
>
Athlon XP2100+ 512MB SDRAM
All tests with mmx-registers with a dumb loop:
for (int i = 0; i < 1000000000; ++i) // 10**9 times
{
__asm movq mm6, [emptySquares]
__asm movq mm1, [queens]
getQueenAttacksMMX();
__asm movq [queenAttacks], mm0
}
none inlined
getRookAttacksMMX() 24 seconds
getQueenAttacksMMX() 51 seconds
The classical rotated Lookup (inlined also with mmx)
getQueenAttacks (i&63) 35 seconds
getQueenAttacks (i&1) 55!seconds
General purpose registers became interessant with hammer (eight additional) for
this algorithms. But also hammer is compatible with mmx and xmm (SSE/SSE2)
register sets. So i let the compiler his general purpose registers to use myself
the mmx- or xmm-registers for this low level stuff.
With x86-32 architecture you can hold only up to three bitboards in 32bit
registers.
One drawback with current athlon is that returning a bitboard in edx:eax is
rather expensive. It requires two "movd reg32,mmxreg" vector-path instructions.
That should become clearly better with hammer using only one "movq rax,mm0".
Another drawback with x86-32 is the probably missing 8-Byte alignment of pushed
actual bitboard parameters on the stack. Therefore it's recommend to use
movd mm0, [bp]
punpckldq mm0, [bp+4]
instead of one
movq mm0, [bp]
>I mean P4 and probably future P4s have just 1 mmx register.
>not a nice thought :)
Hmm, P4 has 8 mmx and 8 xmm (128bit) registers (SSE/SEE2). I can't believe that
they will reduce their registers and make a lot of (multimedia) software
incompatible.
See you in Leiden,
Gerd
<snip>
// input: mm1 queens
// mm6 emptySquares
// output: mm0 queenAttacks
// all mmx registers changed
//==============================
void getQueenAttacksMMX()
{
__asm
{ // up/down Kogge-Stone
movq mm2, mm1 ; generator upg
movq mm4, mm1 ; generator dng
movq mm5, mm6 ; propagator upp
movq mm7, mm6 ; propagator dnp
psllq mm2, 8 ; upg << 8
psrlq mm4, 8 ; dng >> 8
psllq mm5, 8 ; upp << 8
psrlq mm7, 8 ; dnp >> 8
pand mm2, mm6 ; upp & (upg << 8)
pand mm4, mm6 ; dnp & (dng >> 8)
por mm2, mm1 ; upg |= upp & (upg << 8)
por mm4, mm1 ; dng |= dnp & (dng >> 8)
pand mm5, mm6 ; upp &= upp << 8
pand mm7, mm6 ; dnp &= dnp >> 8
movq mm0, mm2 ; upg
movq mm3, mm4 ; dng
psllq mm2, 16 ; upg << 16
psrlq mm4, 16 ; dng >> 16
pand mm2, mm5 ; upp & (upg << 16)
pand mm4, mm7 ; dnp & (dng >> 16)
por mm2, mm0 ; upg |= upp & (upg << 16)
por mm4, mm3 ; dng |= dnp & (dng >> 16)
movq mm0, mm5 ; upp
movq mm3, mm7 ; dnp
psllq mm5, 16 ; upp << 16
psrlq mm7, 16 ; dnp >> 16
pand mm5, mm0 ; upp &= p << 16
movq mm0, mm2 ; upg
pand mm7, mm3 ; dnp &= p >> 16
movq mm3, mm4 ; dng
psrlq mm4, 32 ; dng >> 32
psllq mm0, 32 ; upg << 32
pand mm0, mm5 ; upp & (upg << 32)
// right with x^(x-2)
pcmpeqd mm5, mm5 ; 0xffffffffffffffff -1
pand mm4, mm7 ; dnp & (dng >> 32)
pcmpeqd mm7, mm7 ; 0xffffffffffffffff -1
por mm0, mm2 ; upg |= upp & (upg << 32)
pxor mm5, mm6 ; not empty ==> occupied
por mm4, mm3 ; dng |= dnp & (dng >> 32)
por mm5, mm1 ; force queens as subset of occupied
psllq mm0, 8 ; final shift up ==> queenAttacks
psrlq mm4, 8 ; final shift dn
movq mm3, mm5 ; occupied
psubb mm5, mm1 ; occupied - rooks
psubb mm5, mm1 ; occupied - 2*rooks
por mm0, mm4 ; queenAttacks |= down
pxor mm4, mm4 ; 0x0000000000000000
psubb mm4, mm7 ; 0x0101010101010101 0-(-1)
pxor mm5, mm3 ; right := occupied ^ (occupied - 2*rooks)
// diagonals and left dumb7fill
movq mm2, mm1 ; leftup
psubb mm7, mm4 ; 0xfefefefefefefefe notA 0xff-0x01
por mm0, mm5 ; queenAttacks |= right
movq mm5, mm7 ; 0xfefefefefefefefe notA
pand mm2, mm7 ; clear left occupied or a file
psrlq mm5, 1 ; 0x7f7f7f7f7f7f7f7f notH
movq mm3, mm2 ; leftdown
pand mm1, mm5 ; clear right occupied or h file
pand mm7, mm6 ; to clear left occupied or a-file
pand mm6, mm5 ; to clear right occupied or h-file
movq mm4, mm1 ; rightdown
movq mm5, mm2 ; left
// 1. fill diagonals and left
psllq mm1, 9 ; rightup
psrlq mm4, 7 ; rightdown
psllq mm2, 7 ; leftup
psrlq mm3, 9 ; leftdown
psrlq mm5, 1 ; left
por mm0, mm1 ; queenAttacks |= rightup
por mm0, mm4 ; queenAttacks |= rightdown
por mm0, mm2 ; queenAttacks |= leftup
por mm0, mm3 ; queenAttacks |= leftdown
por mm0, mm5 ; queenAttacks |= left
pand mm1, mm6 ; clear rightup occupied or h file
pand mm4, mm6 ; clear rightdown occupied or h file
pand mm2, mm7 ; clear leftup occupied or a file
pand mm3, mm7 ; clear leftdown occupied or a file
pand mm5, mm7 ; clear left occupied or a file
// 2. fill diagonals and left
psllq mm1, 9 ; rightup
psrlq mm4, 7 ; rightdown
psllq mm2, 7 ; leftup
psrlq mm3, 9 ; leftdown
psrlq mm5, 1 ; left
por mm0, mm1 ; queenAttacks |= rightup
por mm0, mm4 ; queenAttacks |= rightdown
por mm0, mm2 ; queenAttacks |= leftup
por mm0, mm3 ; queenAttacks |= leftdown
por mm0, mm5 ; queenAttacks |= left
pand mm1, mm6 ; clear rightup occupied or h file
pand mm4, mm6 ; clear rightdown occupied or h file
pand mm2, mm7 ; clear leftup occupied or a file
pand mm3, mm7 ; clear leftdown occupied or a file
pand mm5, mm7 ; clear left occupied or a file
// 3. fill diagonals and left
psllq mm1, 9 ; rightup
psrlq mm4, 7 ; rightdown
psllq mm2, 7 ; leftup
psrlq mm3, 9 ; leftdown
psrlq mm5, 1 ; left
por mm0, mm1 ; queenAttacks |= rightup
por mm0, mm4 ; queenAttacks |= rightdown
por mm0, mm2 ; queenAttacks |= leftup
por mm0, mm3 ; queenAttacks |= leftdown
por mm0, mm5 ; queenAttacks |= left
pand mm1, mm6 ; clear rightup occupied or h file
pand mm4, mm6 ; clear rightdown occupied or h file
pand mm2, mm7 ; clear leftup occupied or a file
pand mm3, mm7 ; clear leftdown occupied or a file
pand mm5, mm7 ; clear left occupied or a file
// 4. fill diagonals and left
psllq mm1, 9 ; rightup
psrlq mm4, 7 ; rightdown
psllq mm2, 7 ; leftup
psrlq mm3, 9 ; leftdown
psrlq mm5, 1 ; left
por mm0, mm1 ; queenAttacks |= rightup
por mm0, mm4 ; queenAttacks |= rightdown
por mm0, mm2 ; queenAttacks |= leftup
por mm0, mm3 ; queenAttacks |= leftdown
por mm0, mm5 ; queenAttacks |= left
pand mm1, mm6 ; clear rightup occupied or h file
pand mm4, mm6 ; clear rightdown occupied or h file
pand mm2, mm7 ; clear leftup occupied or a file
pand mm3, mm7 ; clear leftdown occupied or a file
pand mm5, mm7 ; clear left occupied or a file
// 5. fill diagonals and left
psllq mm1, 9 ; rightup
psrlq mm4, 7 ; rightdown
psllq mm2, 7 ; leftup
psrlq mm3, 9 ; leftdown
psrlq mm5, 1 ; left
por mm0, mm1 ; queenAttacks |= rightup
por mm0, mm4 ; queenAttacks |= rightdown
por mm0, mm2 ; queenAttacks |= leftup
por mm0, mm3 ; queenAttacks |= leftdown
por mm0, mm5 ; queenAttacks |= left
pand mm1, mm6 ; clear rightup occupied or h file
pand mm4, mm6 ; clear rightdown occupied or h file
pand mm2, mm7 ; clear leftup occupied or a file
pand mm3, mm7 ; clear leftdown occupied or a file
pand mm5, mm7 ; clear left occupied or a file
// 6. fill diagonals and left
psllq mm1, 9 ; rightup
psrlq mm4, 7 ; rightdown
psllq mm2, 7 ; leftup
psrlq mm3, 9 ; leftdown
psrlq mm5, 1 ; left
por mm0, mm1 ; queenAttacks |= rightup
por mm0, mm4 ; queenAttacks |= rightdown
por mm0, mm2 ; queenAttacks |= leftup
por mm0, mm3 ; queenAttacks |= leftdown
por mm0, mm5 ; queenAttacks |= left
pand mm1, mm6 ; clear rightup occupied or h file
pand mm4, mm6 ; clear rightdown occupied or h file
pand mm2, mm7 ; clear leftup occupied or a file
pand mm3, mm7 ; clear leftdown occupied or a file
pand mm5, mm7 ; clear left occupied or a file
// 7. fill diagonals and left
psllq mm1, 9 ; rightup
psrlq mm4, 7 ; rightdown
psllq mm2, 7 ; leftup
psrlq mm3, 9 ; leftdown
psrlq mm5, 1 ; left
por mm0, mm1 ; queenAttacks |= rightup
por mm0, mm4 ; queenAttacks |= rightdown
por mm0, mm2 ; queenAttacks |= leftup
por mm0, mm3 ; queenAttacks |= leftdown
por mm0, mm5 ; queenAttacks |= left
}
}
The classical rotated bitboard approach i compared with:
#define BITBOARD_ALIGN __declspec(align(8))
BitBoard BITBOARD_ALIGN sFileAtta[64][64]; // [sq][innerOccupiedstate]
BitBoard sRankAtta[64][64];
BitBoard sA1H8Atta[64][64];
BitBoard sH1A8Atta[64][64];
BitBoard OccuBB90 = 0;
BitBoard OccuBB00 = 0;
BitBoard OccuBBA1H8 = 0;
BitBoard OccuBBH1A8 = 0;
// output mm0 queenAttacks
//==========================
void getQueenAttacks (int sq)
{
__asm
{
mov eax, sq
mov edx, eax
mov ecx, eax
and edx, 7 ; file
shr ecx, 3 ; rank
mov edi, ecx ; rank to edi
shl eax, 9
movzx edx, [OccuBB90 + edx]; state on file
movzx ecx, [OccuBB00 + ecx]; state on rank
and edx, 0x7e
and ecx, 0x7e
movq mm0, [sFileAtta + eax + edx*4]
por mm0, [sRankAtta + eax + ecx*4]
mov edx, eax
shr edx, 9 ; square = file here because of "and 7"
mov ecx, edx
sub edx, edi ; file - rank
not ecx
sub ecx, edi ; ~file - rank
and edx, 7
and ecx, 7
movzx edx, [OccuBBA1H8 + edx]
movzx ecx, [OccuBBH1A8 + ecx]
and edx, 0x7e
and ecx, 0x7e
por mm0, [sA1H8Atta + eax + edx*4]
por mm0, [sH1A8Atta + eax + ecx*4]
}
}
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.