Author: Gerd Isenberg
Date: 13:14:10 07/17/04
Got it - but count the cycles yourself.
With a 90-degree rotation trick applied to the 64-byte weight array, the task of a
dot product is rather easy with AMD64 SSE2 instructions: 31 instructions and only
134 bytes for that routine in 32-bit mode, most of the time running as four
independent instruction chains.
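For illustration, here is a plain C sketch of what the routine computes (a
hypothetical helper, not part of the original routine): for every set bit, add the
weight of that square, addressing the weights in the rotated, file-major layout
explained below (a1 = bit 0).

// plain C reference of the bitboard dot product (sketch)
typedef unsigned __int64 BitBoard;
typedef unsigned char WType;

int dotProductRef(BitBoard bb, const WType* weights)
{
    int sum = 0, sq;
    for (sq = 0; sq < 64; sq++) {
        if ((bb >> sq) & 1) {
            int file = sq & 7;               // 0 = a .. 7 = h
            int rank = sq >> 3;              // 0 = rank 1 .. 7 = rank 8
            sum += weights[file*8 + rank];   // rotated (file-major) weight index
        }
    }
    return sum;
}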
Considering the bitboard in the lower half of a 128-bit xmm register, a
punpcklqdq xmm,xmm instruction (Unpack and Interleave Low Quadwords) copies it
into the upper half.
With three further 128-bit movdqa instructions we now have eight copies of the
bitboard in four xmm registers. Now the eight bitboard copies are anded with:
0x0202020202020202:0x0101010101010101
0x0808080808080808:0x0404040404040404
0x2020202020202020:0x1010101010101010
0x8080808080808080:0x4040404040404040
to mask each bit bytewise, so that each byte holds at most one bit.
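In scalar terms the masking step looks like this (a hypothetical helper for
illustration, using the BitBoard typedef from the source below): anding with
0x0101010101010101 keeps only the lowest bit of each byte, so byte r of the
result carries just the a-file bit of rank r+1; the other seven masks do the same
for files b through h.

// scalar sketch of one masking step: each result byte holds at most one bit
int aFileBitOfRank(BitBoard bb, int r)        // r = 0..7 selects the rank
{
    BitBoard aFile = bb & 0x0101010101010101; // one candidate bit per byte
    return (int)((aFile >> (8*r)) & 0xFF);    // 0x00 or 0x01
}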
The 90-degree rotation of the resulting byte vector, and therefore of the weight
array as well, follows from the fact that 0x0101010101010101 masks a1,a2,a3...a8,
while 0x8080808080808080 masks h1..h8.
The next step is to use the "Packed Compare Equal Bytes" instruction (pcmpeqb)
against zero to get 0 and -1 masks. Since a byte equal to zero produces 0xff and a
non-zero byte produces 0x00, the result is the complement of what we want -
fortunately there is a pandn instruction to take care of that beforehand. The
result is a 64-byte {0|-1} vector ordered a1,a2,a3...a8,b1...h8.
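Per byte lane, the pandn + pcmpeqb combination behaves like this scalar sketch (a
hypothetical helper, just for illustration):

// per-byte logic of pandn followed by pcmpeqb against zero
unsigned char laneMask(unsigned char bbByte, unsigned char bitMask)
{
    unsigned char andn = (unsigned char)(~bbByte & bitMask); // pandn: mask the complement
    return (unsigned char)(andn == 0 ? 0xFF : 0x00);         // pcmpeqb: 0xff iff the bit was set
}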
The rest was already mentioned: four further ands with the appropriately mapped
weight vectors, then adding all bytes together into one 16-bit word.
Cheers,
Gerd
//=========================================================
// some sample source, I tried it with MSC6 and inline assembly
//=========================================================
#include <stdio.h>
typedef unsigned __int64 BitBoard;
typedef unsigned char WType;
#define CACHE_LINE 64
#define CACHE_ALIGN __declspec(align(CACHE_LINE))
struct SMaskConsts
{
    BitBoard C01; BitBoard C02;
    BitBoard C04; BitBoard C08;
    BitBoard C10; BitBoard C20;
    BitBoard C40; BitBoard C80;
};
const SMaskConsts CACHE_ALIGN MaskConsts =
{
    0x0101010101010101, 0x0202020202020202,
    0x0404040404040404, 0x0808080808080808,
    0x1010101010101010, 0x2020202020202020,
    0x4040404040404040, 0x8080808080808080,
};
int dotProduct(BitBoard bb, WType* weights)
{
    __asm
    {
        movq       xmm0, [bb]         ; get the bb into the lower ..
        lea        eax, [MaskConsts]
        punpcklqdq xmm0, xmm0         ; .. and the upper half
        pxor       xmm7, xmm7         ; zero for compare and byte difference
        movdqa     xmm1, xmm0
        movdqa     xmm2, xmm0
        movdqa     xmm3, xmm0
        pandn      xmm0, [eax]        ; mask the consecutive bits
        pandn      xmm1, [eax+16]
        pandn      xmm2, [eax+32]
        pandn      xmm3, [eax+48]
        mov        eax, [weights]
        pcmpeqb    xmm0, xmm7         ; build the -1|0 byte masks
        pcmpeqb    xmm1, xmm7
        pcmpeqb    xmm2, xmm7
        pcmpeqb    xmm3, xmm7
        pand       xmm0, [eax]        ; and them with the weights
        pand       xmm1, [eax+16]
        pand       xmm2, [eax+32]
        pand       xmm3, [eax+48]
        psadbw     xmm0, xmm7         ; horizontal adds,
        psadbw     xmm1, xmm7         ; bytes to words
        psadbw     xmm2, xmm7
        psadbw     xmm3, xmm7
        paddd      xmm0, xmm1         ; vertical adds
        paddd      xmm2, xmm3
        paddd      xmm2, xmm0
        movdqa     xmm0, xmm2         ; finally add high and low quadwords
        punpckhqdq xmm0, xmm2         ; like a shift right
        paddd      xmm0, xmm2
        movd       eax, xmm0          ; expensive to move into a gp register!
    }
    // result is returned implicitly in eax
}
int main(int argc, char* argv[])
{
    WType CACHE_ALIGN weights[64] =
    {
        1, 2, 3, 4, 4, 3, 2, 1, // a1..a8
        2, 3, 4, 5, 5, 4, 3, 2, // b1..b8
        3, 4, 5, 6, 6, 5, 4, 3, // c1..c8
        4, 5, 6, 7, 7, 6, 5, 4, // d1..d8
        4, 5, 6, 7, 7, 6, 5, 4, // e1..e8
        3, 4, 5, 6, 6, 5, 4, 3, // f1..f8
        2, 3, 4, 5, 5, 4, 3, 2, // g1..g8
        1, 2, 3, 4, 4, 3, 2, 1, // h1..h8
    };
    int bonus = dotProduct(0xfedcba9876543210, weights);
    return bonus;
}
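For comparison, here is a rough equivalent of the routine written with SSE2
intrinsics (emmintrin.h) instead of inline assembly. It reuses BitBoard, WType and
MaskConsts from the listing above; it is only a sketch of the same idea, not the
original code.

#include <emmintrin.h>

int dotProductIntrinsics(BitBoard bb, const WType* weights)
{
    const __m128i* masks = (const __m128i*)&MaskConsts;
    const __m128i* w     = (const __m128i*)weights;
    __m128i zero = _mm_setzero_si128();
    __m128i v, x0, x1, x2, x3;

    v  = _mm_loadl_epi64((const __m128i*)&bb);          // movq: bb into the lower half
    v  = _mm_unpacklo_epi64(v, v);                      // punpcklqdq: copy to upper half
    x0 = _mm_andnot_si128(v, _mm_load_si128(masks+0));  // pandn: ~bb & mask
    x1 = _mm_andnot_si128(v, _mm_load_si128(masks+1));
    x2 = _mm_andnot_si128(v, _mm_load_si128(masks+2));
    x3 = _mm_andnot_si128(v, _mm_load_si128(masks+3));
    x0 = _mm_cmpeq_epi8(x0, zero);                      // pcmpeqb: 0xff where the bit is set
    x1 = _mm_cmpeq_epi8(x1, zero);
    x2 = _mm_cmpeq_epi8(x2, zero);
    x3 = _mm_cmpeq_epi8(x3, zero);
    x0 = _mm_and_si128(x0, _mm_load_si128(w+0));        // pand: select the weights
    x1 = _mm_and_si128(x1, _mm_load_si128(w+1));
    x2 = _mm_and_si128(x2, _mm_load_si128(w+2));
    x3 = _mm_and_si128(x3, _mm_load_si128(w+3));
    x0 = _mm_sad_epu8(x0, zero);                        // psadbw: horizontal byte sums
    x1 = _mm_sad_epu8(x1, zero);
    x2 = _mm_sad_epu8(x2, zero);
    x3 = _mm_sad_epu8(x3, zero);
    x0 = _mm_add_epi32(x0, x1);                         // paddd: vertical adds
    x2 = _mm_add_epi32(x2, x3);
    x0 = _mm_add_epi32(x0, x2);
    x0 = _mm_add_epi32(x0, _mm_unpackhi_epi64(x0, x0)); // add high and low quadwords
    return _mm_cvtsi128_si32(x0);                       // movd to the gp register
}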