Author: Dezhi Zhao
Date: 12:08:37 09/04/03
Go up one level in this thread
Yes, SSE can beat the regular integer code, but only by a small margin.
Test hardware: Pentium 4 1.6 GHz, Intel motherboard, FSB400, SDRAM
Test run results:
&old_key = 40d2d0, &new_key = 40d2c0, &type_rnd = 40bac0
old_key = 18be678400294823, new_key = 4512153f17260b03, c++ = 31s
old_key = 18be678400294823, new_key = 4512153f17260b03, asm = 25s
old_key = 18be678400294823, new_key = 4512153f17260b03, sse = 23s
// sse.cpp
//
#include "stdafx.h"
#include <stdlib.h>
#include <time.h>
// Number of iterations of each timed loop.  The value is larger than INT_MAX,
// so it carries an explicit unsigned suffix: without it the literal's type (and
// therefore the type of any comparison against a loop counter) depends on which
// language standard the compiler follows.
#define COUNT_RUN 2500000000u

// Hash key before the move is applied (input of the update_key_* routines).
__declspec(align(16)) unsigned __int64 old_key;
// Hash key after the move is applied (output of the update_key_* routines).
__declspec(align(16)) unsigned __int64 new_key;
// Random 64-bit value per (piece type, square) pair; indexed by [type][square].
__declspec(align(16)) unsigned __int64 type_rnd[12][64];
// A move unpacked from its 32-bit integer encoding:
//   bits  0-7   from square
//   bits  8-15  to square
//   bits 16-23  piece type
//   bits 24-31  unused padding
struct Move
{
    unsigned char from; // from square
    unsigned char to;   // to square
    unsigned char type; // piece type
    unsigned char pad;  // unused

    // Converting constructor, deliberately not `explicit` so that
    // `Move m = move;` keeps working.
    //
    // The bytes are extracted with shifts and masks instead of storing the
    // int through a cast of `this`: that store violated the aliasing rules
    // and only produced this field layout on little-endian machines.
    Move(int move)
    {
        const unsigned int bits = (unsigned int) move;

        from = (unsigned char) (bits & 0xFFu);
        to   = (unsigned char) ((bits >> 8) & 0xFFu);
        type = (unsigned char) ((bits >> 16) & 0xFFu);
        pad  = (unsigned char) ((bits >> 24) & 0xFFu);
    }
};
// Reference (compiler-generated) key update for a non-capturing move:
// XORs the moving piece's random value for the from square and for the to
// square into old_key and stores the result in new_key.
//
// move: packed move; the low byte is the from square, the next byte the to
//       square, and everything from bit 16 upwards selects the type_rnd row.
void __fastcall update_key_non_capture(int move)
{
    const Move squares = move; // unpack the from/to bytes
    const unsigned __int64 *piece_row = type_rnd[move >> 16];

    new_key = old_key ^ piece_row[squares.from] ^ piece_row[squares.to];
}
// Hand-written 32-bit integer version of update_key_non_capture():
//   new_key = old_key ^ type_rnd[type][from] ^ type_rnd[type][to]
// The 64-bit XOR is carried out as two independent 32-bit halves
// (ecx accumulates the low half, eax the high half).
//
// The code reads `move` from ecx (the __fastcall register for the first
// argument).  The function is naked, so there is no compiler-generated
// prologue/epilogue and the routine returns with its own `ret`.  Only eax,
// ecx and edx are modified.
//
// NOTE: the row index is taken from bits 16-31 of `move`; the padding byte
// (bits 24-31) is not masked off, so it has to be zero for the table lookups
// to stay inside type_rnd.
__declspec(naked) void __fastcall update_key_non_capture_asm(int move)
{
__asm
{
movzx eax, cl // eax = from square (bits 0-7 of move)
movzx edx, ch // edx = to square (bits 8-15 of move)
shr ecx, 10 // (move >> 16) * 64, with the old bits 10-15 left in the low 6 bits
and ecx, ~63 // clear those low 6 bits: ecx = type * 64 = element offset of row [type]
add eax, ecx // eax = element index of type_rnd[type][from]
add edx, ecx // edx = element index of type_rnd[type][to]
mov ecx, dword ptr [old_key] // ecx = low 32 bits of old_key
xor ecx, dword ptr type_rnd[eax*8] // low half ^= low 32 bits of the from entry (8 bytes per entry)
mov eax, dword ptr type_rnd[eax*8+4] // eax = high 32 bits of the from entry (index in eax is no longer needed)
xor ecx, dword ptr type_rnd[edx*8] // low half ^= low 32 bits of the to entry
xor eax, dword ptr type_rnd[edx*8+4] // high half ^= high 32 bits of the to entry
xor eax, dword ptr [old_key + 4] // high half ^= high 32 bits of old_key
mov dword ptr [new_key], ecx // store low 32 bits of new_key
mov dword ptr [new_key+4], eax // store high 32 bits of new_key
ret
}
}
// SSE version of update_key_non_capture():
//   new_key = old_key ^ type_rnd[type][from] ^ type_rnd[type][to]
// The whole 64-bit XOR is done in the low half of an XMM register.
//
// The code reads `move` from ecx (the __fastcall register for the first
// argument).  The function is naked, so it returns with its own `ret`.
// Modifies eax, ecx, edx and xmm0-xmm2.
//
// All memory operands are exactly 64 bits wide (movlps).  Loading 128 bits
// from these 8-byte objects would read the 8 bytes that follow them; for the
// last table entry, type_rnd[11][63], that is a read past the end of the
// array.  movlps leaves the upper half of the destination register untouched;
// those upper halves take part in the xorps but are never stored, so their
// contents do not matter.
//
// NOTE: as in the integer version, bits 24-31 of `move` are not masked off
// and must be zero for the lookups to stay inside type_rnd.
__declspec(naked) void __fastcall update_key_non_capture_sse(int move)
{
    __asm
    {
        movzx   eax, cl                          // eax = from square (bits 0-7)
        movzx   edx, ch                          // edx = to square (bits 8-15)
        shr     ecx, 10                          // (move >> 16) * 64 plus junk in low 6 bits
        and     ecx, ~63                         // ecx = type * 64 = element offset of row [type]
        movlps  xmm2, qword ptr [old_key]        // xmm2.low64 = old_key
        add     eax, ecx                         // element index of type_rnd[type][from]
        add     edx, ecx                         // element index of type_rnd[type][to]
        movlps  xmm0, qword ptr type_rnd[eax*8]  // xmm0.low64 = from entry
        movlps  xmm1, qword ptr type_rnd[edx*8]  // xmm1.low64 = to entry
        xorps   xmm0, xmm2                       // from ^ old_key
        xorps   xmm0, xmm1                       // ... ^ to
        movlps  qword ptr [new_key], xmm0        // store the low 64 bits as new_key
        ret
    }
}
// Returns a pseudo-random 64-bit value assembled from successive rand() calls.
//
// Only the low 15 bits of each rand() result are used, because RAND_MAX is
// only guaranteed to be at least 32767.  Five calls supply 75 bits; the oldest
// 11 are shifted out, so every one of the 64 result bits is random.  (Combining
// two calls as `rand() | (rand() << 16)` left bits 15 and 31 of each half
// permanently zero with a 15-bit rand(), overflowed a signed int with a wider
// rand(), and did not fix the order of the two calls.)
//
// The sequence depends only on the rand() state, so it is reproducible for a
// given srand() seed.
__int64 rand64()
{
    unsigned __int64 bits = 0;

    for (int chunk = 0; chunk < 5; chunk++)
        bits = (bits << 15) | (unsigned __int64) (rand() & 0x7FFF);

    return (__int64) bits;
}
// Fills the benchmark data with pseudo-random values: first old_key, then
// every type_rnd entry in row-major order (all 64 squares of type 0, then
// type 1, and so on).
void init()
{
    old_key = rand64();

    // One flat counter over the 12 x 64 table; n / 64 is the piece type and
    // n % 64 the square, which visits the entries in row-major order.
    for (int n = 0; n < 12 * 64; n++)
        type_rnd[n / 64][n % 64] = rand64();
}
// Benchmark driver: prints the addresses of the key data, fills it with
// pseudo-random values, then times COUNT_RUN calls of each of the three
// update_key_non_capture variants on the same move and prints the resulting
// key and the elapsed wall-clock seconds for each.
int main(int argc, char* argv[])
{
    // Test move "e4": from square 12, to square 28, piece type 0.
    const int test_move = (12) | (28 << 8) | (0 << 16);
    time_t t0, t1;
    // COUNT_RUN is larger than INT_MAX, so a signed int counter would
    // overflow before reaching it; an unsigned counter covers the full range.
    unsigned int i;

    // Pointers are printed with %p; %x expects an unsigned int.
    printf("&old_key = %p, &new_key = %p, &type_rnd = %p\n",
           (void*) &old_key, (void*) &new_key, (void*) &type_rnd[0][0]);

    init();

    // Compiler-generated version.
    time(&t0);
    for (i = 0; i < COUNT_RUN; i++)
        update_key_non_capture(test_move);
    time(&t1);
    // The width of time_t varies between compilers, so the elapsed time is
    // converted to int explicitly to match %d.
    printf("old_key = %I64x, new_key = %I64x, c++ = %ds\n",
           old_key, new_key, (int) difftime(t1, t0));

    // Hand-written 32-bit integer version.
    time(&t0);
    for (i = 0; i < COUNT_RUN; i++)
        update_key_non_capture_asm(test_move);
    time(&t1);
    printf("old_key = %I64x, new_key = %I64x, asm = %ds\n",
           old_key, new_key, (int) difftime(t1, t0));

    // Hand-written SSE version.
    time(&t0);
    for (i = 0; i < COUNT_RUN; i++)
        update_key_non_capture_sse(test_move);
    time(&t1);
    printf("old_key = %I64x, new_key = %I64x, sse = %ds\n",
           old_key, new_key, (int) difftime(t1, t0));

    return 0;
}
This page took 0.01 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.