Computer Chess Club Archives


Search

Terms

Messages

Subject: Re: Here is some test data

Author: Gerd Isenberg

Date: 14:25:38 09/04/03

Go up one level in this thread


On September 04, 2003 at 15:08:37, Dezhi Zhao wrote:

>Yes, SSE could beat the regular by a small margin.

Great - but try it in a real chess program.

In these loop tests some internal unrolling may occur, with "renamed" register
sets. I guess the loop-body including the called SSE-routine fits in P4's trace
cache, and two (or more) bodies are executed simultaneously.

With the gp-register approach this is only partially possible, because the
registers are changed several times inside the body.

Anyway, not that bad for P4, considering that only 64 bits of the 128 are used
per XMM register. What about MMX on P4, and what about the SSE2 integer
instructions movdqa, movdqu, movd and pxor? OK, one byte more opcode, but shorter
latencies (at least on Opteron for movdqa and pxor). I do not have the P4
SSE instruction latencies, but on Opteron an unaligned move is a killer because
it is a vector-path instruction - movups as well as movdqu.

I'm interested in the assembler output of your C-routine - a bit strange that it
performs so "badly".

Regards,
Gerd


>
>Test hardware P4 1.6G Intel M/B FSB400 SDRAM
>Test run results:
>&old_key = 40d2d0, &new_key = 40d2c0,  &type_rnd = 40bac0
>old_key = 18be678400294823, new_key = 4512153f17260b03,  c++ = 31s
>old_key = 18be678400294823, new_key = 4512153f17260b03,  asm = 25s
>old_key = 18be678400294823, new_key = 4512153f17260b03,  sse = 23s
>
>
>// sse.cpp
>//
>
>#include "stdafx.h"
>#include <stdlib.h>
>#include <time.h>
>
>#define COUNT_RUN 2500000000
>
>__declspec(align(16)) unsigned __int64 old_key;
>__declspec(align(16)) unsigned __int64 new_key;
>__declspec(align(16)) unsigned __int64 type_rnd[12][64];	// indexed by [type][square]
>
>struct Move
>{
>	unsigned char from;		// from square
>	unsigned char to;		// to square
>	unsigned char type;		// piece type
>	unsigned char pad;		// unused
>
>	Move(int move) { *((int*) this) = move; };	// a simple converter
>};
>
>void __fastcall update_key_non_capture(int move)
>{
>	Move m  = move;			// cast back to convenient form
>
>	new_key = old_key ^ type_rnd[move >> 16][m.from] ^ type_rnd[move >> 16][m.to];
>}
>
>
>__declspec(naked) void __fastcall update_key_non_capture_asm(int move)
>{
>	__asm
>	{
>		movzx	eax, cl				// from
>		movzx	edx, ch				// to
>		shr ecx, 10				// type * 64
>		and ecx, ~63				// mask off
>		add eax, ecx				// type from index
>		add edx, ecx				// type to index
>		mov ecx, dword ptr [old_key]		// old low32
>		xor ecx, dword ptr type_rnd[eax*8]	// from low32
>		mov eax, dword ptr type_rnd[eax*8+4]	// from high32
>		xor ecx, dword ptr type_rnd[edx*8]	// xor to low32
>		xor eax, dword ptr type_rnd[edx*8+4]	// xor to high32
>		xor eax, dword ptr [old_key + 4]	// old high32
>
>		mov dword ptr [new_key], ecx		// store low32
>		mov dword ptr [new_key+4], eax		// store high32
>		ret
>	}
>}
>
>__declspec(naked) void __fastcall update_key_non_capture_sse(int move)
>{
>	__asm
>	{
>		movzx	eax, cl				// from
>		movzx	edx, ch				// to
>		shr ecx, 10				// type * 64
>		and	ecx, ~63			// mask off
>		movaps	xmm2, [old_key]			// old_key 128
>
>		add eax, ecx				// type from index
>		add edx, ecx				// type to index
>
>		movups	xmm0, type_rnd[eax*8]		// from 128
>		movups	xmm1, type_rnd[edx*8]		// to 128
>		xorps	xmm0, xmm2
>		xorps	xmm0, xmm1
>
>		movlps	[new_key], xmm0			// store 64
>		ret
>	}
>}
>
>__int64 rand64()
>{
>	union
>	{
>		__int64 q;
>		struct
>		{
>			int low;
>			int high;
>		};
>	} r;
>
>	r.low = rand() | (rand() << 16);
>	r.high = rand() | (rand() << 16);
>
>	return r.q;
>};
>
>void init()
>{
>	old_key = rand64();
>
>	for (int i = 0; i < 12; i++)
>		for (int j = 0; j < 64; j++)
>			type_rnd[i][j] = rand64();
>};
>
>
>int main(int argc, char* argv[])
>{
>	time_t t0, t1;
>	int i;
>
>	printf("&old_key = %x, &new_key = %x,  &type_rnd = %x\n",
>			&old_key, &new_key, &type_rnd[0][0]);
>	init();
>
>	time(&t0);
>	for (i = 0; i < COUNT_RUN; i++)
>		update_key_non_capture((12) | (28 << 8) | (0 << 16));	// test e4
>	time(&t1);
>	printf("old_key = %I64x, new_key = %I64x,  c++ = %ds\n", old_key, new_key, t1 -
>t0);
>
>	time(&t0);
>	for (i = 0; i < COUNT_RUN; i++)
>		update_key_non_capture_asm((12) | (28 << 8) | (0 << 16));	// test e4
>	time(&t1);
>	printf("old_key = %I64x, new_key = %I64x,  asm = %ds\n", old_key, new_key, t1 -
>t0);
>
>	time(&t0);
>	for (i = 0; i < COUNT_RUN; i++)
>		update_key_non_capture_sse((12) | (28 << 8) | (0 << 16));	// test e4
>	time(&t1);
>	printf("old_key = %I64x, new_key = %I64x,  sse = %ds\n", old_key, new_key, t1 -
>t0);
>
>	return 0;
>}



This page took 0.01 seconds to execute

Last modified: Thu, 15 Apr 21 08:11:13 -0700

Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.