Author: Gerd Isenberg
Date: 14:25:38 09/04/03
Go up one level in this thread
On September 04, 2003 at 15:08:37, Dezhi Zhao wrote:
>Yes, SSE could beat the regular by a small margin.
Great - but try it in a real chess program.
In these loop tests some internal unrolling may occur, with "renamed" register
sets. I guess the loop-body including the called SSE-routine fits in P4's trace
cache, and two (or more) bodies are executed simultaneously.
With the gp-register approach this is only partially possible, because registers
are changed several times inside the body.
Anyway not that bad for P4, considering only 64-bits of 128 used per
XMM-register. What about MMX on P4 and what about SSE2-integer instructions,
movdqa, movdqu, movd and pxor? Ok one byte more opcode, but shorter latencies
(at least on opteron for movdqa and pxor). I do not have P4 SSE-instruction
latencies, but for opteron, mov unaligned is a killer due to vector path
instruction, movups as well as movdqu.
I'm interested in the assembler output of your C-routine - a bit strange that it
performs so "badly".
Regards,
Gerd
>
>Test hardware P4 1.6G Intel M/B FSB400 SDRAM
>Test run results:
>&old_key = 40d2d0, &new_key = 40d2c0, &type_rnd = 40bac0
>old_key = 18be678400294823, new_key = 4512153f17260b03, c++ = 31s
>old_key = 18be678400294823, new_key = 4512153f17260b03, asm = 25s
>old_key = 18be678400294823, new_key = 4512153f17260b03, sse = 23s
>
>
>// sse.cpp
>//
>
>#include "stdafx.h"
>#include <stdlib.h>
>#include <time.h>
>
>#define COUNT_RUN 2500000000
>
>__declspec(align(16)) unsigned __int64 old_key;
>__declspec(align(16)) unsigned __int64 new_key;
>__declspec(align(16)) unsigned __int64 type_rnd[12][64]; // indexed by [type][square]
>
>// A move stored in four bytes: from square, to square, piece type and an
>// unused pad byte. The converting constructor writes an int straight over the
>// object, so which bits of the int land in which field follows the in-memory
>// byte order of the target.
>// NOTE(review): main() builds moves as from | (to << 8) | (type << 16), which
>// presumably assumes a little-endian target and sizeof(Move) == sizeof(int) —
>// confirm before reusing this elsewhere.
>struct Move
>{
> unsigned char from; // from square
> unsigned char to; // to square
> unsigned char type; // piece type
> unsigned char pad; // unused
>
> Move(int move) { *((int*) this) = move; }; // a simple converter
>};
>
>// Plain C++ reference version: writes to new_key the value of old_key XORed
>// with the type_rnd entries for the moving piece on its from square and on its
>// to square. old_key itself is not modified.
>// Note that the row index is taken as move >> 16 rather than m.type, so any
>// bits set in the pad byte (bits 24 and up) would also enter the index.
>void __fastcall update_key_non_capture(int move)
>{
> Move m = move; // cast back to convenient form
>
> new_key = old_key ^ type_rnd[move >> 16][m.from] ^ type_rnd[move >> 16][m.to];
>}
>
>
>// Hand-written 32-bit x86 version of update_key_non_capture using only
>// general-purpose registers. The body reads the move from ecx: cl is the from
>// square and ch the to square. (ecx >> 10) & ~63 equals (move >> 16) * 64 for
>// a non-negative move, i.e. the element offset of the piece-type row in
>// type_rnd; adding the square gives the element index, scaled by 8 bytes in
>// the addressing mode.
>// The 64-bit XOR is done as two 32-bit halves (low half accumulated in ecx,
>// high half in eax) and both halves are stored to new_key.
>// The function is declared naked and ends with its own ret.
>__declspec(naked) void __fastcall update_key_non_capture_asm(int move)
>{
> __asm
> {
> movzx eax, cl // from
> movzx edx, ch // to
> shr ecx, 10 // type * 64
> and ecx, ~63 // mask off
> add eax, ecx // type from index
> add edx, ecx // type to index
> mov ecx, dword ptr [old_key] // old low32
> xor ecx, dword ptr type_rnd[eax*8] // from low32
> mov eax, dword ptr type_rnd[eax*8+4] // from high32
> xor ecx, dword ptr type_rnd[edx*8] // xor to low32
> xor eax, dword ptr type_rnd[edx*8+4] // xor to high32
> xor eax, dword ptr [old_key + 4] // old high32
>
> mov dword ptr [new_key], ecx // store low32
> mov dword ptr [new_key+4], eax // store high32
> ret
> }
>}
>
>// SSE version of update_key_non_capture. The index computation is the same as
>// in the general-purpose-register version (cl = from, ch = to,
>// (ecx >> 10) & ~63 = type * 64). The XOR is done in XMM registers and only
>// the low 64 bits of the result are stored to new_key (movlps).
>// The old_key load is an aligned 16-byte load (movaps); old_key is declared
>// with 16-byte alignment. The table loads are unaligned 16-byte loads
>// (movups) because an 8-byte table entry is only guaranteed 8-byte alignment.
>// NOTE(review): each 16-byte load reads 8 bytes beyond the 8-byte object it
>// targets. For type_rnd[11][63] that is 8 bytes past the end of the table, and
>// for old_key it relies on readable bytes following the variable — confirm
>// this is acceptable before using the routine outside this benchmark.
>__declspec(naked) void __fastcall update_key_non_capture_sse(int move)
>{
> __asm
> {
> movzx eax, cl // from
> movzx edx, ch // to
> shr ecx, 10 // type * 64
> and ecx, ~63 // mask off
> movaps xmm2, [old_key] // old_key 128
>
> add eax, ecx // type from index
> add edx, ecx // type to index
>
> movups xmm0, type_rnd[eax*8] // from 128
> movups xmm1, type_rnd[edx*8] // to 128
> xorps xmm0, xmm2
> xorps xmm0, xmm1
>
> movlps [new_key], xmm0 // store 64
> ret
> }
>}
>
>// Builds a 64-bit value from four rand() calls: each 32-bit half is
>// rand() | (rand() << 16), and the halves are combined through a union with
>// an anonymous struct (low word first).
>// NOTE(review): if RAND_MAX is 0x7fff (15 bits), bits 15 and 31 of each half
>// can never be set. The keys in the quoted test output, where every 16-bit
>// group is below 0x8000, are consistent with that — confirm for the target
>// C runtime before relying on all 64 bits being random.
>// NOTE(review): the order in which the two rand() calls within one expression
>// are evaluated is unspecified, so the exact values may differ by compiler.
>__int64 rand64()
>{
> union
> {
> __int64 q;
> struct
> {
> int low;
> int high;
> };
> } r;
>
> r.low = rand() | (rand() << 16);
> r.high = rand() | (rand() << 16);
>
> return r.q;
>};
>
>// Fills the benchmark data: assigns a rand64() value to old_key and to every
>// one of the 12 x 64 entries of type_rnd. No srand() call appears in this
>// code, so the values come from rand()'s default sequence.
>void init()
>{
> old_key = rand64();
>
> for (int i = 0; i < 12; i++)
> for (int j = 0; j < 64; j++)
> type_rnd[i][j] = rand64();
>};
>
>
>// Benchmark driver. Prints the addresses of old_key, new_key and type_rnd,
>// initialises the data with init(), then runs each of the three
>// update_key_non_capture variants COUNT_RUN times on the same move and prints
>// both keys together with the elapsed time reported by time().
>// argc and argv are not used. Always returns 0.
>int main(int argc, char* argv[])
>{
> time_t t0, t1;
> // COUNT_RUN (2500000000) is larger than INT_MAX, so a signed int counter
> // could only reach it through signed overflow, which is undefined
> // behaviour, and it never reaches it where the literal is typed as a
> // 64-bit integer. An unsigned counter (assuming 32-bit unsigned int, as on
> // the quoted test platform) covers the whole range.
> unsigned int i;
>
> // NOTE(review): %x is kept so the output matches the quoted test run; it is
> // only appropriate where a pointer has the size of an unsigned int.
> printf("&old_key = %x, &new_key = %x, &type_rnd = %x\n",
>  &old_key, &new_key, &type_rnd[0][0]);
> init();
>
> time(&t0);
> for (i = 0; i < COUNT_RUN; i++)
>  update_key_non_capture((12) | (28 << 8) | (0 << 16)); // test e4
> time(&t1);
> // The time difference is cast to int so the argument matches %d.
> printf("old_key = %I64x, new_key = %I64x, c++ = %ds\n", old_key, new_key,
>  (int) (t1 - t0));
>
> time(&t0);
> for (i = 0; i < COUNT_RUN; i++)
>  update_key_non_capture_asm((12) | (28 << 8) | (0 << 16)); // test e4
> time(&t1);
> printf("old_key = %I64x, new_key = %I64x, asm = %ds\n", old_key, new_key,
>  (int) (t1 - t0));
>
> time(&t0);
> for (i = 0; i < COUNT_RUN; i++)
>  update_key_non_capture_sse((12) | (28 << 8) | (0 << 16)); // test e4
> time(&t1);
> printf("old_key = %I64x, new_key = %I64x, sse = %ds\n", old_key, new_key,
>  (int) (t1 - t0));
>
> return 0;
>}
This page took 0.01 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.