Author: Gerd Isenberg
Date: 03:58:19 03/31/05
Hi all SIMD-freaks,
a few more or less useless but hopefully didactic gems about shuffling and
rotating bits and bytes around, using SSE2 intrinsics for P4, Centrino or AMD64
with the MSVC 2005 beta compiler (the Intel compiler should work as well).
http://www.talkchess.com/forums/1/message.html?418648
The first gem converts a "classical" 64-byte board-array with a value range
0..15 per byte to a quad-bitboard where each nibble is coded vertically in four
bitboards:
input: BYTE board[64] with nibbles only
output: BitBoard qbb[4] with
for (int i=0; i < 64; i++) {
bit[i] of qbb[0] = board[i] & 1;
bit[i] of qbb[1] = (board[i] & 2) >> 1;
bit[i] of qbb[2] = (board[i] & 4) >> 2;
bit[i] of qbb[3] = (board[i] & 8) >> 3;
}
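For reference, a plain scalar version of this mapping could look like the
following (just a minimal sketch for verification; it assumes BitBoard is an
unsigned 64-bit type and BYTE an unsigned 8-bit type, and the name
board2quadBB_scalar is made up):
void board2quadBB_scalar(BitBoard qbb[4], const BYTE board[64])
{
    qbb[0] = qbb[1] = qbb[2] = qbb[3] = 0;
    for (int i = 0; i < 64; i++) {
        // bit k of board[i] goes to bit i of qbb[k]
        qbb[0] |= (BitBoard)( board[i]       & 1) << i;
        qbb[1] |= (BitBoard)((board[i] >> 1) & 1) << i;
        qbb[2] |= (BitBoard)((board[i] >> 2) & 1) << i;
        qbb[3] |= (BitBoard)((board[i] >> 3) & 1) << i;
    }
}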
It seems that using PMOVMSKB reg32/64,xmmreg 16 times is the fastest approach
here; even though PMOVMSKB is a 3-cycle VectorPath instruction, the routine
executes in less than 60 cycles.
From AMD's manual:
==============================================================================
PMOVMSKB Packed Move Mask Byte
Moves the most-significant bit of each byte in the source operand to the
destination, with zero-extension to 32 bits. The destination and source operands
are a 32-bit general-purpose register and an XMM register. The result is written
to the low-order word of the general-purpose register.
==============================================================================
Note that bit 3 of 16 consecutive board bytes (bit positions 3, 11, 19, 27, 35,
43, 51, 59, ... within the register) supplies 16 consecutive bits of
quad-bitboard[3]. Since PMOVMSKB only gathers the most significant bit of each
byte, every 16-bit word is first shifted left by four so that bit 3 of each
board byte becomes that byte's MSB. So the approach is quite straightforward:
void board2quadBB(BitBoard qbb[], const BYTE board[])
{
__m128i x0, x1, x2, x3;
__m128i* ps = (__m128i*) board;
int* pq = (int*)qbb;
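// shift each 16-bit word left by 4: since the upper nibbles are zero,
// bit 3 of every board byte ends up as that byte's MSB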
x0 = _mm_slli_epi16(_mm_load_si128 (ps+0), 4);
x1 = _mm_slli_epi16(_mm_load_si128 (ps+1), 4);
x2 = _mm_slli_epi16(_mm_load_si128 (ps+2), 4);
x3 = _mm_slli_epi16(_mm_load_si128 (ps+3), 4);
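// PMOVMSKB collects those 16 MSBs; pq[6] and pq[7] are the low and high
// 32-bit halves of qbb[3]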
pq[6] = _mm_movemask_epi8(x0) + (_mm_movemask_epi8(x1)<<16);
pq[7] = _mm_movemask_epi8(x2) + (_mm_movemask_epi8(x3)<<16);
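// shifting left by one more bit moves bit 2 into the MSBs, yielding qbb[2];
// the same pattern repeats below for qbb[1] and qbb[0]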
x0 = _mm_slli_epi16(x0, 1);
x1 = _mm_slli_epi16(x1, 1);
x2 = _mm_slli_epi16(x2, 1);
x3 = _mm_slli_epi16(x3, 1);
pq[4] = _mm_movemask_epi8(x0) + (_mm_movemask_epi8(x1)<<16);
pq[5] = _mm_movemask_epi8(x2) + (_mm_movemask_epi8(x3)<<16);
x0 = _mm_slli_epi16(x0, 1);
x1 = _mm_slli_epi16(x1, 1);
x2 = _mm_slli_epi16(x2, 1);
x3 = _mm_slli_epi16(x3, 1);
pq[2] = _mm_movemask_epi8(x0) + (_mm_movemask_epi8(x1)<<16);
pq[3] = _mm_movemask_epi8(x2) + (_mm_movemask_epi8(x3)<<16);
x0 = _mm_slli_epi16(x0, 1);
x1 = _mm_slli_epi16(x1, 1);
x2 = _mm_slli_epi16(x2, 1);
x3 = _mm_slli_epi16(x3, 1);
pq[0] = _mm_movemask_epi8(x0) + (_mm_movemask_epi8(x1)<<16);
pq[1] = _mm_movemask_epi8(x2) + (_mm_movemask_epi8(x3)<<16);
}
How about the "inverse" function, to convert a quad-bitboard back to a board
array?
for (int i=0; i< 64; i++) {
bit[0] of board[i] = bit[i] of qbb[0];
bit[1] of board[i] = bit[i] of qbb[1];
bit[2] of board[i] = bit[i] of qbb[2];
bit[3] of board[i] = bit[i] of qbb[3];
}
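Again a scalar reference for checking the SIMD code (a minimal sketch under the
same assumptions as above, name made up):
void quadBB2Board_scalar(BYTE board[64], const BitBoard qbb[4])
{
    for (int i = 0; i < 64; i++) {
        // bit i of qbb[k] becomes bit k of board[i]
        board[i] = (BYTE)( ((qbb[0] >> i) & 1)
                         | (((qbb[1] >> i) & 1) << 1)
                         | (((qbb[2] >> i) & 1) << 2)
                         | (((qbb[3] >> i) & 1) << 3) );
    }
}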
The PMOVMSKB approach of gathering the consecutive bits 63,55,47,39,31,23,15,7
from one of the quad-bitboards is not really helpful here, because those bits
would have to be stored at bit positions 0-3 of the bytes with the matching
indices 63,55,47,39,31,23,15,7 - hmm.
Here the bit-to-byte expanding idea used in the SSE2 dot product comes to mind.
Applying this bit-to-byte expanding trick four times, once per bitboard of the
quad-bitboard, masking the resulting {0x00|0xff} bytes with 1, 2, 4, 8 and
ORing all four sets together leaves the original 64-byte board in four XMM
registers. This procedure takes about 120 cycles:
void quadBB2Board_first_try (BYTE b[], const BitBoard q[])
{
static const BitBoard XMM_ALIGN masks[10] = {
0x8040201008040201, 0x8040201008040201,
0x0101010101010101, 0x0101010101010101,
0x0202020202020202, 0x0202020202020202,
0x0404040404040404, 0x0404040404040404,
0x0808080808080808, 0x0808080808080808
};
__m128i t0,t1,t2,t3, b0,b1,b2,b3;
__m128i* pm = (__m128i*) masks;
__m128i* pq = (__m128i*) q;
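// qbb[0] (low quadword of pq[0]): expand every bit to a 0x00|0xff byte,
// then keep nibble value 1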
t0 = _mm_load_si128(pq+0);
t0 = _mm_unpacklo_epi8 (t0, t0);
t2 = _mm_unpackhi_epi16 (t0, t0);
t0 = _mm_unpacklo_epi16 (t0, t0);
t1 = _mm_unpackhi_epi32 (t0, t0);
t0 = _mm_unpacklo_epi32 (t0, t0);
t3 = _mm_unpackhi_epi32 (t2, t2);
t2 = _mm_unpacklo_epi32 (t2, t2);
t0 = _mm_and_si128 (t0, _mm_load_si128(pm));
t1 = _mm_and_si128 (t1, _mm_load_si128(pm));
t2 = _mm_and_si128 (t2, _mm_load_si128(pm));
t3 = _mm_and_si128 (t3, _mm_load_si128(pm));
t0 = _mm_cmpeq_epi8 (t0, _mm_load_si128(pm));
t1 = _mm_cmpeq_epi8 (t1, _mm_load_si128(pm));
t2 = _mm_cmpeq_epi8 (t2, _mm_load_si128(pm));
t3 = _mm_cmpeq_epi8 (t3, _mm_load_si128(pm));
b0 = _mm_and_si128 (t0, _mm_load_si128 (pm+1));
b1 = _mm_and_si128 (t1, _mm_load_si128 (pm+1));
b2 = _mm_and_si128 (t2, _mm_load_si128 (pm+1));
b3 = _mm_and_si128 (t3, _mm_load_si128 (pm+1));
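// qbb[1] (high quadword of pq[0]): same expansion, masked with 2 and ORed in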
t0 = _mm_load_si128(pq+0);
t0 = _mm_unpackhi_epi8 (t0, t0);
t2 = _mm_unpackhi_epi16 (t0, t0);
t0 = _mm_unpacklo_epi16 (t0, t0);
t1 = _mm_unpackhi_epi32 (t0, t0);
t0 = _mm_unpacklo_epi32 (t0, t0);
t3 = _mm_unpackhi_epi32 (t2, t2);
t2 = _mm_unpacklo_epi32 (t2, t2);
t0 = _mm_and_si128 (t0, _mm_load_si128(pm));
t1 = _mm_and_si128 (t1, _mm_load_si128(pm));
t2 = _mm_and_si128 (t2, _mm_load_si128(pm));
t3 = _mm_and_si128 (t3, _mm_load_si128(pm));
t0 = _mm_cmpeq_epi8 (t0, _mm_load_si128(pm));
t1 = _mm_cmpeq_epi8 (t1, _mm_load_si128(pm));
t2 = _mm_cmpeq_epi8 (t2, _mm_load_si128(pm));
t3 = _mm_cmpeq_epi8 (t3, _mm_load_si128(pm));
t0 = _mm_and_si128 (t0, _mm_load_si128 (pm+2));
t1 = _mm_and_si128 (t1, _mm_load_si128 (pm+2));
t2 = _mm_and_si128 (t2, _mm_load_si128 (pm+2));
t3 = _mm_and_si128 (t3, _mm_load_si128 (pm+2));
b0 = _mm_or_si128 (b0, t0);
b1 = _mm_or_si128 (b1, t1);
b2 = _mm_or_si128 (b2, t2);
b3 = _mm_or_si128 (b3, t3);
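// qbb[2] (low quadword of pq[1]): masked with 4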
t0 = _mm_load_si128(pq+1);
t0 = _mm_unpacklo_epi8 (t0, t0);
t2 = _mm_unpackhi_epi16 (t0, t0);
t0 = _mm_unpacklo_epi16 (t0, t0);
t1 = _mm_unpackhi_epi32 (t0, t0);
t0 = _mm_unpacklo_epi32 (t0, t0);
t3 = _mm_unpackhi_epi32 (t2, t2);
t2 = _mm_unpacklo_epi32 (t2, t2);
t0 = _mm_and_si128 (t0, _mm_load_si128(pm));
t1 = _mm_and_si128 (t1, _mm_load_si128(pm));
t2 = _mm_and_si128 (t2, _mm_load_si128(pm));
t3 = _mm_and_si128 (t3, _mm_load_si128(pm));
t0 = _mm_cmpeq_epi8 (t0, _mm_load_si128(pm));
t1 = _mm_cmpeq_epi8 (t1, _mm_load_si128(pm));
t2 = _mm_cmpeq_epi8 (t2, _mm_load_si128(pm));
t3 = _mm_cmpeq_epi8 (t3, _mm_load_si128(pm));
t0 = _mm_and_si128 (t0, _mm_load_si128 (pm+3));
t1 = _mm_and_si128 (t1, _mm_load_si128 (pm+3));
t2 = _mm_and_si128 (t2, _mm_load_si128 (pm+3));
t3 = _mm_and_si128 (t3, _mm_load_si128 (pm+3));
b0 = _mm_or_si128 (b0, t0);
b1 = _mm_or_si128 (b1, t1);
b2 = _mm_or_si128 (b2, t2);
b3 = _mm_or_si128 (b3, t3);
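// qbb[3] (high quadword of pq[1]): masked with 8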
t0 = _mm_load_si128(pq+1);
t0 = _mm_unpackhi_epi8 (t0, t0);
t2 = _mm_unpackhi_epi16 (t0, t0);
t0 = _mm_unpacklo_epi16 (t0, t0);
t1 = _mm_unpackhi_epi32 (t0, t0);
t0 = _mm_unpacklo_epi32 (t0, t0);
t3 = _mm_unpackhi_epi32 (t2, t2);
t2 = _mm_unpacklo_epi32 (t2, t2);
t0 = _mm_and_si128 (t0, _mm_load_si128(pm));
t1 = _mm_and_si128 (t1, _mm_load_si128(pm));
t2 = _mm_and_si128 (t2, _mm_load_si128(pm));
t3 = _mm_and_si128 (t3, _mm_load_si128(pm));
t0 = _mm_cmpeq_epi8 (t0, _mm_load_si128(pm));
t1 = _mm_cmpeq_epi8 (t1, _mm_load_si128(pm));
t2 = _mm_cmpeq_epi8 (t2, _mm_load_si128(pm));
t3 = _mm_cmpeq_epi8 (t3, _mm_load_si128(pm));
t0 = _mm_and_si128 (t0, _mm_load_si128 (pm+4));
t1 = _mm_and_si128 (t1, _mm_load_si128 (pm+4));
t2 = _mm_and_si128 (t2, _mm_load_si128 (pm+4));
t3 = _mm_and_si128 (t3, _mm_load_si128 (pm+4));
b0 = _mm_or_si128 (b0, t0);
b1 = _mm_or_si128 (b1, t1);
b2 = _mm_or_si128 (b2, t2);
b3 = _mm_or_si128 (b3, t3);
__m128i* pb = (__m128i*) b;
_mm_store_si128 (pb+0, b0);
_mm_store_si128 (pb+1, b1);
_mm_store_si128 (pb+2, b2);
_mm_store_si128 (pb+3, b3);
}
One possible improvement is to apply the shorter "rotated" expand four times and
finally to rotate (swap rows and columns) all 64 bytes after a combined OR -
this takes about 100 cycles.
The 90-degree byte[64] rotation trick is shown separately, as it might be useful
elsewhere:
for (row=0; row < 8; row++)
for (col=0; col < 8; col++)
target[row][col] = source[col][row];
void rotate8x8bytes (BYTE target[], const BYTE source[] /* XMM_ALIGN */)
{
__m128i x0, x1, x2, x3, y0, y1, y2, y3;
__m128i* ps = (__m128i*) source;
__m128i* pt = (__m128i*) target;
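// 8x8 byte transpose in three interleave passes: bytes, words, then dwords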
x0 = _mm_load_si128 (ps+0);
x1 = _mm_load_si128 (ps+1);
x2 = _mm_load_si128 (ps+2);
x3 = _mm_load_si128 (ps+3);
y0 = _mm_unpackhi_epi64 (x0, x0);
y1 = _mm_unpackhi_epi64 (x1, x1);
y2 = _mm_unpackhi_epi64 (x2, x2);
y3 = _mm_unpackhi_epi64 (x3, x3);
x0 = _mm_unpacklo_epi8 (x0, y0);
x1 = _mm_unpacklo_epi8 (x1, y1);
x2 = _mm_unpacklo_epi8 (x2, y2);
x3 = _mm_unpacklo_epi8 (x3, y3);
y0 = _mm_unpacklo_epi16 (x0, x1);
y1 = _mm_unpackhi_epi16 (x0, x1);
y2 = _mm_unpacklo_epi16 (x2, x3);
y3 = _mm_unpackhi_epi16 (x2, x3);
x0 = _mm_unpacklo_epi32 (y0, y2);
x1 = _mm_unpackhi_epi32 (y0, y2);
x2 = _mm_unpacklo_epi32 (y1, y3);
x3 = _mm_unpackhi_epi32 (y1, y3);
_mm_store_si128 (pt+0, x0);
_mm_store_si128 (pt+1, x1);
_mm_store_si128 (pt+2, x2);
_mm_store_si128 (pt+3, x3);
}
Finally the conversion of a quad-bitboard to the board[64] array:
void quadBB2Board(BYTE board[], const BitBoard quad[])
{
static const BitBoard XMM_ALIGN masks[8] = {
0x0101010101010101, 0x0202020202020202,
0x0404040404040404, 0x0808080808080808,
0x1010101010101010, 0x2020202020202020,
0x4040404040404040, 0x8080808080808080
};
__m128i t0,t1,t2,t3, b0,b1,b2,b3;
__m128i* pm = (__m128i*) masks;
__m128i* pq = (__m128i*) quad;
__m128i* pb = (__m128i*) board;
// 1. bitboard 0x02:0x01
t0 = _mm_load_si128(pq+0);
t1 = _mm_unpacklo_epi64(t0, t0);
b0 = _mm_and_si128 (t1, _mm_load_si128(pm+0));
b1 = _mm_srli_epi64 ( _mm_and_si128 (t1, _mm_load_si128(pm+1)), 2);
b2 = _mm_srli_epi64 ( _mm_and_si128 (t1, _mm_load_si128(pm+2)), 4);
b3 = _mm_srli_epi64 ( _mm_and_si128 (t1, _mm_load_si128(pm+3)), 6);
// 2. bitboard 0x04:0x02
t2 = _mm_unpackhi_epi64(t0, t0);
b0 = _mm_or_si128 ( b0, _mm_slli_epi64
( _mm_and_si128 (t2, _mm_load_si128(pm+0)), 1));
b1 = _mm_or_si128 ( b1, _mm_srli_epi64
( _mm_and_si128 (t2, _mm_load_si128(pm+1)), 1));
b2 = _mm_or_si128 ( b2, _mm_srli_epi64
( _mm_and_si128 (t2, _mm_load_si128(pm+2)), 3));
b3 = _mm_or_si128 ( b3, _mm_srli_epi64
( _mm_and_si128 (t2, _mm_load_si128(pm+3)), 5));
// 3. bitboard 0x08:0x04
t0 = _mm_load_si128(pq+1);
t1 = _mm_unpacklo_epi64(t0, t0);
b0 = _mm_or_si128 ( b0, _mm_slli_epi64
( _mm_and_si128 (t1, _mm_load_si128(pm+0)), 2));
b1 = _mm_or_si128 ( b1,
_mm_and_si128 (t1, _mm_load_si128(pm+1)) );
b2 = _mm_or_si128 ( b2, _mm_srli_epi64
( _mm_and_si128 (t1, _mm_load_si128(pm+2)), 2));
b3 = _mm_or_si128 ( b3, _mm_srli_epi64
( _mm_and_si128 (t1, _mm_load_si128(pm+3)), 4));
// 4. bitboard 0x10:0x08
t2 = _mm_unpackhi_epi64(t0, t0);
b0 = _mm_or_si128 ( b0, _mm_slli_epi64
( _mm_and_si128 (t2, _mm_load_si128(pm+0)), 3));
b1 = _mm_or_si128 ( b1, _mm_slli_epi64
( _mm_and_si128 (t2, _mm_load_si128(pm+1)), 1));
b2 = _mm_or_si128 ( b2, _mm_srli_epi64
( _mm_and_si128 (t2, _mm_load_si128(pm+2)), 1));
b3 = _mm_or_si128 ( b3, _mm_srli_epi64
( _mm_and_si128 (t2, _mm_load_si128(pm+3)), 3));
// rotate 8*8 bytes (512 bit) in b0,b1,b2,b3
b0 = _mm_unpacklo_epi8(b0, _mm_srli_epi64 (_mm_unpackhi_epi64(b0,b0),1));
b1 = _mm_unpacklo_epi8(b1, _mm_srli_epi64 (_mm_unpackhi_epi64(b1,b1),1));
b2 = _mm_unpacklo_epi8(b2, _mm_srli_epi64 (_mm_unpackhi_epi64(b2,b2),1));
b3 = _mm_unpacklo_epi8(b3, _mm_srli_epi64 (_mm_unpackhi_epi64(b3,b3),1));
t0 = _mm_unpacklo_epi16 (b0, b1);
t1 = _mm_unpackhi_epi16 (b0, b1);
t2 = _mm_unpacklo_epi16 (b2, b3);
t3 = _mm_unpackhi_epi16 (b2, b3);
b0 = _mm_unpacklo_epi32 (t0, t2);
b1 = _mm_unpackhi_epi32 (t0, t2);
b2 = _mm_unpacklo_epi32 (t1, t3);
b3 = _mm_unpackhi_epi32 (t1, t3);
_mm_store_si128 (pb+0, b0);
_mm_store_si128 (pb+1, b1);
_mm_store_si128 (pb+2, b2);
_mm_store_si128 (pb+3, b3);
}
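To sanity-check the two conversions against each other, a small round-trip test
along these lines should do (just a sketch; it assumes BYTE, BitBoard and the
16-byte alignment macro XMM_ALIGN are defined as in the listings above):
#include <stdio.h>
#include <string.h>

int main()
{
    BYTE XMM_ALIGN board[64];      // 16-byte aligned, required by the SSE2 loads/stores
    BYTE XMM_ALIGN back[64];
    BitBoard XMM_ALIGN qbb[4];
    for (int i = 0; i < 64; i++)
        board[i] = (BYTE)(i & 15); // arbitrary nibble pattern
    board2quadBB(qbb, board);      // board -> quad-bitboard
    quadBB2Board(back, qbb);       // quad-bitboard -> board
    puts(memcmp(board, back, sizeof(board)) == 0 ? "round trip ok" : "mismatch!");
    return 0;
}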
On intrinsics see
http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclang/html/vclrfmmxssesse2intrisics.asp
Cheers,
Gerd