Author: Gerd Isenberg
Date: 07:07:42 05/31/04
Go up one level in this thread
On May 30, 2004 at 16:42:53, Dieter Buerssner wrote:
>Perhaps your results are not strange at all. Your function just seems to be fast
>when everything is inlined :-) On my computer, my method is a bit faster.
>
>I tried a few things. Some observations. With MSVC 6, changing your odd() and
>maj() functions to macros gives reproducably faster times here. 4.24 vs 4.49 ns
>per 32-bit popcount (This would translate to 84.8 vs 89.8 s in your 1e9 loop). I
>also used clock() for timing, but did it slightly different than you with some
>#define tricks, so that each function will be called inside exactly the same
>environment (each test run for only one function).
>
>With Gcc, using inline instead of the macro, was faster ...
>
>My pc_10bb needs 3.94 ns. It was coded not very well. One can also code it
>without the need of temporary arrays (code even gets smaller). Both Gcc and MSVC
>can translate it, so that it will need 6 registers (so no temporary space on the
>stack at all). An unrolled version will need 5 registers (about 280
>instructions). I show the loop version at the end.
>
>This somewhat more carfully loop version needs 3.35ns. Unrolled 3.08 ns. This is
>only 7.8 cycles per 32-bit popcount. The fastest I got your function, was 4.24
>ns/32-bit word (using MSVC, macro odd/maj and a version of popcount similar to
>yours, but without multiplication, and adding the counters one stage earlier -
>all called functions inlined).
Amazing Dieter!
I tried a MMX-version based on the "dead slow" popcount of amd's optimization
manual, with the eight add,major pairs. Even that takes about 42ns -> 2.1
ns/32-bit. To get an idea what is possible with AMD64 gp registers!
Cheers,
Gerd
#define CACHE_LINE 32
#define CACHE_ALIGN __declspec(align(CACHE_LINE))
__forceinline
int popCount10(BitBoard a[])
{
struct SBitCountConsts
{
BitBoard C33;
BitBoard C55;
BitBoard C0F;
};
const SBitCountConsts CACHE_ALIGN BitCountConsts =
{
0x3333333333333333, // C33
0x5555555555555555, // C55
0x0f0f0f0f0f0f0f0f, // C0F
};
__asm
{
mov eax, [a]
// mm0 t1 = odd( a[0], a[1], a[2]);
// mm1 t2 = maj( a[0], a[1], a[2]);
movq mm0, [eax]
movq mm7, [eax]
pxor mm0, [eax+8] ; x^y
pand mm7, [eax+8] ; x&y
movq mm1, mm0 ; x^y
pxor mm0, [eax+16] ; t1
pand mm1, [eax+16] ; (x^y)&z
por mm1, mm7 ; t2
// mm2 u1 = odd( a[3], a[4], a[5]);
// mm3 u2 = maj( a[3], a[4], a[5]);
movq mm2, [eax+24]
movq mm7, [eax+24]
pxor mm2, [eax+32] ; x^y
pand mm7, [eax+32] ; x&y
movq mm3, mm2 ; x^y
pand mm3, [eax+40] ; (x^y)&z
pxor mm2, [eax+40]
por mm3, mm7
// mm4 v1 = odd( a[6], a[7], a[8]);
// mm5 v2 = maj( a[6], a[7], a[8]);
movq mm4, [eax+48]
movq mm7, [eax+48]
pxor mm4, [eax+56] ; x^y
pand mm7, [eax+56] ; x&y
movq mm5, mm4 ; x^y
pand mm5, [eax+64] ; (x^y)&z
pxor mm4, [eax+64] ; v1
por mm5, mm7 ; v2
// w1 = odd( a[9], t1, u1);
// w2 = maj( a[9], t1, u1);
movq mm6, [eax+72]
movq mm7, [eax+72]
pxor mm6, mm0 ; x^y
pand mm7, mm0 ; x&y
movq mm0, mm6 ; x^y
pand mm0, mm2 ; (x^y)&z
pxor mm6, mm2 ; w1
por mm7, mm0 ; w2
// c[0] = odd(v1, w1, 0);
// x2 = maj(v1, w1, 0);
movq mm0, mm6
pand mm6, mm4 ; x2
pxor mm0, mm4 ; mm0 -> c0
// y2 = odd(t2, u2, v2);
// y4 = maj(t2, u2, v2);
movq mm2, mm1
pxor mm1, mm3 ; x^y
pand mm2, mm3 ; x&y
movq mm3, mm1 ; x^y
pand mm3, mm5 ; (x^y)&z
pxor mm1, mm5 ; y2
por mm3, mm2 ; y4
// c[1] = odd(w2, x2, y2);
// z4 = maj(w2, x2, y2);
movq mm2, mm1
pxor mm2, mm7 ; x^y
pand mm1, mm7 ; x&y
movq mm4, mm2 ; x^y
pand mm4, mm6 ; (x^y)&z
pxor mm2, mm6 ; mm2 -> c1
por mm4, mm1 ; z4
// c[2] = odd(y4, z4, 0);
// c[3] = maj(y4, z4, 0);
lea eax, [BitCountConsts]
movq mm6, mm4
pxor mm4, mm3 ; mm4 -> c2
pand mm6, mm3 ; mm6 -> c3
movq mm1,mm0
movq mm3,mm2
movq mm5,mm4
movq mm7,mm6
psrld mm0,1
psrld mm2,1
psrld mm4,1
psrld mm6,1
pand mm0,[eax].C55
pand mm2,[eax].C55
pand mm4,[eax].C55
pand mm6,[eax].C55
psubb mm1,mm0
psubb mm3,mm2
psubb mm5,mm4
psubb mm7,mm6
movq mm0,mm1
movq mm2,mm3
movq mm4,mm5
movq mm6,mm7
psrld mm1,2
psrld mm3,2
psrld mm5,2
psrld mm7,2
pand mm0,[eax].C33
pand mm1,[eax].C33
pand mm2,[eax].C33
pand mm3,[eax].C33
pand mm4,[eax].C33
pand mm5,[eax].C33
pand mm6,[eax].C33
pand mm7,[eax].C33
paddb mm0,mm1
paddb mm2,mm3
paddb mm4,mm5
paddb mm6,mm7
movq mm1,mm0
movq mm3,mm2
movq mm5,mm4
movq mm7,mm6
psrld mm0,4
psrld mm2,4
psrld mm4,4
psrld mm6,4
paddb mm0,mm1
paddb mm2,mm3
paddb mm4,mm5
paddb mm6,mm7
pand mm0,[eax].C0F
pand mm2,[eax].C0F
pand mm4,[eax].C0F
pand mm6,[eax].C0F
pxor mm1,mm1
paddb mm6,mm6
paddb mm6,mm4
paddb mm6,mm6
paddb mm6,mm2
paddb mm6,mm6
paddb mm6,mm0
psadbw mm6,mm1 ; add all bytes
movd eax,mm6
}
}
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.