Author: Gerd Isenberg
Date: 16:52:13 09/20/02
Hallo all Bitboarders,
My initial aversion against mmx instructions disappears more and more...
Some simple tests seem promising, or at leat interesting. But i had not enough
time yet, to do the "real" test.
I made some modifications with the getRookAttacksMMX routine, i posted a few
days ago:
http://www.talkchess.com/forums/1/message.html?252198
Rather than "anding" with "notH" after one shift logical right (board left in HA
direction), it's smarter with "notA" before shifting, because that could be
combined in further "ands" to clear occupied squares in the fill bitboards.
Same trick is also possible with diagonals, so each fill iteration only takes 12
mmx instructions (4shifts, 4ors, 4ands, hard time for Kogge-Stone at least on
8x8 boards).
There is no need to read data anymore in those functions. They are inlined and
parameter and return bitboard are passed via mmx-register.
To get some feeling of "runtime behaviour", following (dumb) test on my AMD
XP2.1+ was done:
I called the inlined getQueenAttacksMMX in a loop 1,000,000,000 (10**9) times.
It took 57 seconds.
An inlined assembler routine with table lookup took 21 seconds with constant
square parameter, but up to 55 seconds with variable squares and occupied states
of zero (see code fragments below).
I don't have any idea about the probability of cache misses in "real" chess
programs, and about the penalty cycles it took to bring the data into the cache,
but the rather chaotic duration variation of the lookup loop may suggest
something.
57 seconds for 10**9 calls makes 57ns per getQueenAttacksMMX call on a ~1.9GHz
PC, that means ~108 cycles for at least 186mmx instuctions (10+6*12+8 +
16+6*12+8 +...).
If that is true, and mmx execution latency is two cycles, as reported by amd, it
implies the parallel processing of up to four independed mmx-instructions! Is
that possible, or did i make an error?
One Correction of one of my previous posts:
If you call getRookAttacksMMX with a set of two rooks, it's not possible to
extract the attacks for each rook, even if you have the "empty" board attacks of
each rook. That's only possible with a pair set of unequal colored bishops after
calling getBishopAttacksMMX.
Regards,
Gerd
// input: mm1 rooks (rookMover)
// mm6 freeSquares (not changed)
// output: mm0 rookattacks
//===========================================
__forceinline void getRookAttacksMMX()
{
__asm
{
pxor mm5, mm5 ; 0
pcmpeqd mm7, mm7 ; 0xffffffffffffffff
movq mm2, mm1 ; left
psubb mm5, mm7 ; 0x0101010101010101
movq mm3, mm1 ; up
pxor mm7, mm5 ; 0xfefefefefefefefe notA
movq mm4, mm1 ; down
pand mm2, mm7 ; clear left a-file before shift
psrlq mm5, 32+24-3 ; 8
pand mm7, mm6 ; to clear left occupied or a-file
// 1. straight fill in each direction
psrlq mm2, 1 ; left
paddb mm1, mm1 ; right
psllq mm3, mm5 ; up
psrlq mm4, mm5 ; down
movq mm0, mm2 ; rookAttacks = left
por mm0, mm1 ; rookAttacks |= right
por mm0, mm3 ; rookAttacks |= up
por mm0, mm4 ; rookAttacks |= down
pand mm2, mm7 ; clear left occupied or a file
pand mm1, mm6 ; clear right occupied
pand mm3, mm6 ; clear up occupied
pand mm4, mm6 ; clear down occupied
// 2. straight fill in each direction
....
// input: mm1 bishops (bishopMover)
// mm6 freeSquares (not changed)
// output: mm0 bishopAttacks
//===========================================
__forceinline void getBishopAttacksMMX()
{
__asm
{
pxor mm0, mm0 ; 0
pcmpeqd mm7, mm7 ; 0xffffffffffffffff
movq mm2, mm1 ; leftup
psubb mm0, mm7 ; 0x0101010101010101
movq mm3, mm1 ; leftdown
pxor mm7, mm0 ; 0xfefefefefefefefe notA
movq mm4, mm1 ; rightdown
movq mm5, mm7 ; 0xfefefefefefefefe notA
pand mm2, mm7 ; clear leftup occupied or a file
psrlq mm5, 1 ; 0x7f7f7f7f7f7f7f7f notH
pand mm3, mm7 ; clear leftdown occupied or a file
pand mm1, mm5 ; clear rightup occupied or h file
pand mm4, mm5 ; clear rightdown occupied or h file
pand mm7, mm6 ; to clear left occupied or a-file
pand mm5, mm6 ; to clear left occupied or h-file
// 1. diagonal fill in each direction
psllq mm1, 9 ; rightup
psrlq mm4, 7 ; rightdown
psllq mm2, 7 ; leftup
psrlq mm3, 9 ; leftdown
movq mm0, mm1 ; bishopAttacks = rightup
por mm0, mm4 ; bishopAttacks |= rightdown
por mm0, mm2 ; bishopAttacks |= leftup
por mm0, mm3 ; bishopAttacks |= leftdown
pand mm1, mm5 ; clear rightup occupied or h file
pand mm4, mm5 ; clear rightdown occupied or h file
pand mm2, mm7 ; clear leftup occupied or a file
pand mm3, mm7 ; clear leftdown occupied or a file
// 2. diagonal fill in each direction
....
__forceinline void getQueenAttacksMMX()
{
// getRookAttacksMMX() | getBishopAttacksMMX()
.......
The getQueenAttacks lookup code, i compared with:
BitBoard sFileAtta[64][64]; // [square][innerOccupiedState of rotated bitboards]
BitBoard sRankAtta[64][64];
BitBoard sA1H8Atta[64][64];
BitBoard sH1A8Atta[64][64];
BitBoard OccuBB90 = 0;
BitBoard OccuBB00 = 0;
BitBoard OccuBBA1H8 = 0;
BitBoard OccuBBH1A8 = 0;
// output: mm0 queenAttacks
//===========================
__forceinline void getQueenAttacks (int sq)
{
__asm
{
mov eax, sq
mov edx, eax
mov ecx, eax
and edx, 7 ; file
shr ecx, 3 ; rank
mov edi, ecx ; rank to edi
shl eax, 9
movzx edx, [OccuBB90 + edx] ; state of file
movzx ecx, [OccuBB00 + ecx] ; state of rank
and edx, 0x7e
and ecx, 0x7e
movq mm0, [sFileAtta + eax + edx*4]
por mm0, [sRankAtta + eax + ecx*4]
mov edx, eax
shr edx, 9 ; square
mov ecx, edx
sub edx, edi ; file - rank
not ecx
sub ecx, edi ; ~file - rank
and edx, 7
and ecx, 7
movzx edx, [OccuBBA1H8 + edx] ; state of dia1
movzx ecx, [OccuBBH1A8 + ecx] ; state of dia2
and edx, 0x7e
and ecx, 0x7e
por mm0, [sA1H8Atta + eax + edx*4]
por mm0, [sH1A8Atta + eax + ecx*4]
}
}
and some testloops:
21 seconds:
for (int i = 0; i < 1000000000; ++i)
getQueenAttacks (12);
35 seconds:
for (int i = 0; i < 1000000000; ++i)
getQueenAttacks (i&63);
55! seconds:
for (int i = 0; i < 1000000000; ++i)
getQueenAttacks (i&1);
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.