Author: Gerd Isenberg
Date: 05:16:33 03/28/05
Hi all,
Some notes on using Streaming SIMD Extensions 2 (SSE2) for computer chess,
specially on SSE2-Intrinsics:
Without stalls the SSE2-integer instruction throughput is about one instruction
per cycle (AMD64). That is because most SSE2-instructions have a latency of two
cycles and usually perform two 64-bit alu operations.
AMD64 32/64-bit general purpose instructions perform much better on single ints
or __int64 - up to four instructions per cycle.
OTOH, as vectors are involved, SSE2 becomes relatively faster the smaller the
SIMD-Data Type is (Bitboard[2], int[4], shorts[8] or Byte[16] per xmm-register).
Also SSE2 has some powerful instructions here and there, without an easy
gp-instruction equivalent.
Another aspect - SSE2-code reduces gp-register pressure and may execute in
"parallel" with independent gp-code. Future processors may also have 128-bit
alus and faster SSE-execution (let say one cycle latency ;-) as well.
IMHO there are some "nice" bitboard applications using SSE2 gems.
Three options to use SSE2 for vectors of integers with
8/16 (P4,Centrino/AMD64) 128-bit xmm-registers:
1. (Intel) compiler supports automatic vectorization.
2. (deprecated) inline assembly with msvc.
3. SSE2-Intrinsics (including C++ wrappers for vector classes)
As mentioned recently by Aart J.C. Bik, Intel Compiler is able of automatic
vectorization of loops on arrays of byte, short, int and __int64 with
appropriate SSE2-instructions. The results are promising, but there are still
some applications that remain an assembler domain, for instance the
bit[64]*byte[64]-dot-product (see sample 1 below).
Since inline assembly is no longer supported by msc for AMD64 in 64-bit mode,
one "wintel-portable" alternative solution is to use SSE2-intrinsics, which
work for P4, Centrino and AMD64 as well as its Intel clone with 64-bit extensions.
Using intrinsics is not really a win of abstraction - it reminds more on a
C-Level assembler, with the advantage the compiler allocates registers and
schedules instructions.
For me, in the meantime rather familiar with SSE2-assembly mnemonics, the way to
port routines to SSE2-intrinsics is first to write (inline) assembly and then
performing a more or less automatic "one to one"-replacement of SSE2
assembler-instructions with the appropriate intrinsics. Since the SSE2 intrinsics'
prototype is often __m128i = _mm_xxxx_si128 (__m128i, __m128i);
one may skip some movdqa xmm-reg, xmm-reg.
Also in the first step i keep the identifiers of xmm-registers as "usual"
__m128i register variables. Loading and storing data is a bit different, see
application 2 below.
I tested a few routines using ms visual C++ 2005 beta 32-bit compiler. The
generated code of the intrinsics looks well (no more unnecessary stores and
loads) and the routines produce about the same speed as the inline assembly
ones.
=============================================================================
Two applications:
1. The bit[64]*byte[64]-dot-product.
This is really a SSE2-domain!
#include <emmintrin.h>
typedef unsigned __int64 BitBoard;
#define XMM_ALIGN __declspec(align(16))
/* reference C version: sum of weights[i] over every set bit i of bb */
int dotProduct_C(BitBoard bb, BYTE weights[])
{
int total = 0;
for (int sq = 0; sq < 64; ++sq)
if ((bb >> sq) & 1)
total += weights[sq];
return total;
}
/* intrinsic-version ~42 cycles */
/* for average weights < 64 ! */
/*
 * dotProduct - SSE2 bit[64]*byte[64] dot product: returns the sum of
 * weights[i] over all set bits i of bb.
 *
 * bb      - 64-bit selection mask
 * weights - 64 unsigned byte weights; must be 16-byte aligned for the
 *           aligned _mm_load_si128 below (XMM_ALIGN hint is commented out
 *           in the signature - the caller has to guarantee alignment)
 *
 * NOTE(review): the partial sums use saturating byte adds (_mm_adds_epu8),
 * so each byte lane clips at 255. Lane j accumulates the four weights
 * w[j], w[16+j], w[32+j], w[48+j]; the result is exact only while each such
 * group sums to <= 255 - hence the "average weights < 64" restriction above.
 */
int dotProduct(BitBoard bb, BYTE weights[] /* XMM_ALIGN */)
{
/* 0x8040201008040201: byte k of each qword has exactly bit k set, used to
   test one distinct bit per byte lane */
static const BitBoard XMM_ALIGN sbitmask[2] = {
0x8040201008040201,
0x8040201008040201
};
__m128i x0, x1, x2, x3, bm;
bm = _mm_load_si128 ( (__m128i*) sbitmask);
x0 = _mm_loadl_epi64 ( (__m128i*) &bb); // low qword = bb, high qword = 0
/* unpack cascade: replicate each of the eight mask bytes of bb eight times,
   so that x0 = bytes 0-1, x1 = bytes 2-3, x2 = bytes 4-5, x3 = bytes 6-7 */
x0 = _mm_unpacklo_epi8 (x0, x0);
x2 = _mm_unpackhi_epi16 (x0, x0);
x0 = _mm_unpacklo_epi16 (x0, x0);
x1 = _mm_unpackhi_epi32 (x0, x0);
x0 = _mm_unpacklo_epi32 (x0, x0);
x3 = _mm_unpackhi_epi32 (x2, x2);
x2 = _mm_unpacklo_epi32 (x2, x2);
// extend bits to bytes: each byte becomes 0xFF if its mask bit is set, else 0
x0 = _mm_cmpeq_epi8 (_mm_and_si128 (x0, bm), bm);
x1 = _mm_cmpeq_epi8 (_mm_and_si128 (x1, bm), bm);
x2 = _mm_cmpeq_epi8 (_mm_and_si128 (x2, bm), bm);
x3 = _mm_cmpeq_epi8 (_mm_and_si128 (x3, bm), bm);
// multiply by "and" with -1 or 0 (selects weight or zero per byte)
__m128i* pw = (__m128i*) weights;
x0 = _mm_and_si128 (x0, _mm_load_si128 (pw+0));
x1 = _mm_and_si128 (x1, _mm_load_si128 (pw+1));
x2 = _mm_and_si128 (x2, _mm_load_si128 (pw+2));
x3 = _mm_and_si128 (x3, _mm_load_si128 (pw+3));
// add all bytes (with saturation)
x0 = _mm_adds_epu8 (x0, x1);
x0 = _mm_adds_epu8 (x0, x2);
x0 = _mm_adds_epu8 (x0, x3);
/* psadbw vs. zero: horizontal sum of the 8 bytes of each qword into that
   qword's low 16 bits */
x0 = _mm_sad_epu8 (x0, _mm_setzero_si128 ());
/* combine the two 16-bit partial sums (word 0 and word 4) */
return _mm_extract_epi16 (x0, 0)
+ _mm_extract_epi16 (x0, 4);
}
2. Expanding a quad-bitboard to 16-disjoint bitboards.
This is more a domain of 64-bit compiler, using gp-registers.
But of course a clear win for a SSE2-capable x86-32.
/* pure C: expand quad-bitboard q[4] into 16 disjoint bitboards, where
   h[code] collects exactly the squares whose 4-bit value (q3..q0) == code */
void quad2hexBB_C(BitBoard h[], const BitBoard q[])
{
for (int code = 0; code < 16; ++code) {
BitBoard acc = ~(BitBoard)0;
for (int plane = 0; plane < 4; ++plane)
acc &= ((code >> plane) & 1) ? q[plane] : ~q[plane];
h[code] = acc;
}
}
/* inline assembly ~38 cycles */
/*
 * quad2hexBB_A - MSVC 32-bit inline-assembly version of quad2hexBB_C:
 * expands the quad-bitboard q[4] into 16 disjoint bitboards h[0..15],
 * h[i] holding the squares whose 4-bit code (q3,q2,q1,q0) equals i.
 *
 * Strategy: combine the two 128-bit halves (q1:q0 and q3:q2) pairwise via
 * or/and/xor into the four per-plane select masks (~qi / qi), then unpack
 * and AND them together, producing two output bitboards per store.
 *
 * NOTE(review): movdqa requires q[] and h[] to be 16-byte aligned;
 * misaligned input faults. 32-bit-only (uses eax-relative addressing and
 * __asm, which msc no longer supports for AMD64 - see text above).
 */
void quad2hexBB_A(BitBoard h[], const BitBoard q[])
{
__asm
{
mov eax, [q]
pcmpeqd xmm7, xmm7 ; -1 (all ones, independent of prior xmm7 contents)
movdqa xmm0, [eax+0*16]; q1 : q0
movdqa xmm4, [eax+1*16]; q3 : q2
mov eax, [h]
movdqa xmm5, xmm0
movdqa xmm1, xmm0
por xmm0, xmm4
pand xmm5, xmm4 ; q3 & q1 : q2 & q0
pxor xmm0, xmm7 ;~q3 & ~q1 : ~q2 & ~q0
pxor xmm1, xmm5 ;~q3 & q1 : ~q2 & q0
pxor xmm4, xmm5 ; q3 & ~q1 : q2 & ~q0
movdqa xmm6, xmm0
movdqa xmm7, xmm4 ; xmm7 reused - the -1 constant is no longer needed
punpckhqdq xmm0, xmm0 ;~q3 & ~q1 :~q3 & ~q1
punpckhqdq xmm4, xmm4 ; q3 & ~q1 : q3 & ~q1
punpcklqdq xmm6, xmm1 ; ~q2 & q0 : ~q2 & ~q0
punpcklqdq xmm7, xmm5 ; q2 & q0 : q2 & ~q0
punpckhqdq xmm1, xmm1 ;~q3 & q1 :~q3 & q1
punpckhqdq xmm5, xmm5 ; q3 & q1 : q3 & q1
movdqa xmm2, xmm0
movdqa xmm3, xmm1
pand xmm0, xmm6 ;~q3~q2~q1 q0 :~q3~q2~q1~q0
pand xmm1, xmm6 ;~q3~q2 q1 q0 :~q3~q2 q1~q0
pand xmm2, xmm7 ;~q3 q2~q1 q0 :~q3 q2~q1~q0
pand xmm3, xmm7 ;~q3 q2 q1 q0 :~q3 q2 q1~q0
movdqa [eax+0*16], xmm0; 1:0
movdqa [eax+1*16], xmm1; 3:2
movdqa [eax+2*16], xmm2; 5:4
movdqa [eax+3*16], xmm3; 7:6
movdqa xmm0, xmm4
movdqa xmm1, xmm5
pand xmm4, xmm6 ; q3~q2~q1 q0 : q3~q2~q1~q0
pand xmm5, xmm6 ; q3~q2 q1 q0 : q3~q2 q1~q0
pand xmm0, xmm7 ; q3 q2~q1 q0 : q3 q2~q1~q0
pand xmm1, xmm7 ; q3 q2 q1 q0 : q3 q2 q1~q0
movdqa [eax+4*16], xmm4; 9:8
movdqa [eax+5*16], xmm5; B:A
movdqa [eax+6*16], xmm0; D:C
movdqa [eax+7*16], xmm1; F:E
}
}
/* intrinsic-version ~34 cycles */
/*
 * quad2hexBB - intrinsic version of quad2hexBB_C / quad2hexBB_A: expands the
 * quad-bitboard q[4] into 16 disjoint bitboards h[0..15], h[i] holding the
 * squares whose 4-bit code (q3,q2,q1,q0) equals i.
 *
 * Fix: the original produced the all-ones constant with
 * _mm_cmpeq_epi32(f, f) on the still-uninitialized f - undefined behavior
 * in C (the author reports a debug-build runtime exception below).
 * _mm_set1_epi32(-1) yields the same all-ones vector, is well defined, and
 * compilers emit it as pcmpeqd reg,reg anyway.
 *
 * NOTE(review): _mm_load_si128/_mm_store_si128 require q[] and h[] to be
 * 16-byte aligned - caller's responsibility, as in the asm version.
 */
void quad2hexBB(BitBoard h[], const BitBoard q[])
{
__m128i a, b, c, d, e, f;
__m128i* p = (__m128i*) q;
a = _mm_load_si128(p+0); // q1 : q0
c = _mm_load_si128(p+1); // q3 : q2
p = (__m128i*) h;
b = d = a;
a = _mm_or_si128 (a, c);
d = _mm_and_si128 (d, c); // q3 & q1 : q2 & q0
e = a = _mm_xor_si128 (a, _mm_set1_epi32 (-1)); // ~q3 & ~q1 : ~q2 & ~q0
b = _mm_xor_si128 (b, d); // ~q3 & q1 : ~q2 & q0
f = c = _mm_xor_si128 (c, d); // q3 & ~q1 : q2 & ~q0
a = _mm_unpackhi_epi64 (a, a);
c = _mm_unpackhi_epi64 (c, c);
e = _mm_unpacklo_epi64 (e, b);
f = _mm_unpacklo_epi64 (f, d);
b = _mm_unpackhi_epi64 (b, b);
d = _mm_unpackhi_epi64 (d, d);
_mm_store_si128 (p+0, _mm_and_si128 (a, e));
_mm_store_si128 (p+1, _mm_and_si128 (b, e));
_mm_store_si128 (p+2, _mm_and_si128 (a, f));
_mm_store_si128 (p+3, _mm_and_si128 (b, f));
_mm_store_si128 (p+4, _mm_and_si128 (c, e));
_mm_store_si128 (p+5, _mm_and_si128 (d, e));
_mm_store_si128 (p+6, _mm_and_si128 (c, f));
_mm_store_si128 (p+7, _mm_and_si128 (d, f));
}
There is one (minor) problem in the code above: using _mm_cmpeq_epi32(f,f) with
an uninitialized f, but identical arguments, as a -1 setter forces pcmpeqd
xmmi,xmmi, where the initial value of xmmi doesn't matter. The debug version
fires a runtime exception, but the release version was a few cycles faster,
using an uninitialized xmm-register variable. Unfortunately there is no -1
setter analogous to _mm_setzero_si128() for zero - at least I didn't find one.
Cheers,
Gerd
This page took 0.01 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.