Author: Dezhi Zhao
Date: 12:08:37 09/04/03
Go up one level in this thread
Yes, SSE could beat the regular by a small margin. Test hardware P4 1.6G Intel M/B FSB400 SDRAM Test run results: &old_key = 40d2d0, &new_key = 40d2c0, &type_rnd = 40bac0 old_key = 18be678400294823, new_key = 4512153f17260b03, c++ = 31s old_key = 18be678400294823, new_key = 4512153f17260b03, asm = 25s old_key = 18be678400294823, new_key = 4512153f17260b03, sse = 23s // sse.cpp // #include "stdafx.h" #include <stdlib.h> #include <time.h> #define COUNT_RUN 2500000000 __declspec(align(16)) unsigned __int64 old_key; __declspec(align(16)) unsigned __int64 new_key; __declspec(align(16)) unsigned __int64 type_rnd[12][64]; // indexed by [type][square] struct Move { unsigned char from; // from square unsigned char to; // to square unsigned char type; // piece type unsigned char pad; // unused Move(int move) { *((int*) this) = move; }; // a simple converter }; void __fastcall update_key_non_capture(int move) { Move m = move; // cast back to convenient form new_key = old_key ^ type_rnd[move >> 16][m.from] ^ type_rnd[move >> 16][m.to]; } __declspec(naked) void __fastcall update_key_non_capture_asm(int move) { __asm { movzx eax, cl // from movzx edx, ch // to shr ecx, 10 // type * 64 and ecx, ~63 // mask off add eax, ecx // type from index add edx, ecx // type to index mov ecx, dword ptr [old_key] // old low32 xor ecx, dword ptr type_rnd[eax*8] // from low32 mov eax, dword ptr type_rnd[eax*8+4] // from high32 xor ecx, dword ptr type_rnd[edx*8] // xor to low32 xor eax, dword ptr type_rnd[edx*8+4] // xor to high32 xor eax, dword ptr [old_key + 4] // old high32 mov dword ptr [new_key], ecx // store low32 mov dword ptr [new_key+4], eax // store high32 ret } } __declspec(naked) void __fastcall update_key_non_capture_sse(int move) { __asm { movzx eax, cl // from movzx edx, ch // to shr ecx, 10 // type * 64 and ecx, ~63 // mask off movaps xmm2, [old_key] // old_key 128 add eax, ecx // type from index add edx, ecx // type to index movups xmm0, type_rnd[eax*8] // from 128 movups xmm1, type_rnd[edx*8] // to 128 xorps xmm0, xmm2 xorps xmm0, xmm1 movlps [new_key], xmm0 // store 64 ret } } __int64 rand64() { union { __int64 q; struct { int low; int high; }; } r; r.low = rand() | (rand() << 16); r.high = rand() | (rand() << 16); return r.q; }; void init() { old_key = rand64(); for (int i = 0; i < 12; i++) for (int j = 0; j < 64; j++) type_rnd[i][j] = rand64(); }; int main(int argc, char* argv[]) { time_t t0, t1; int i; printf("&old_key = %x, &new_key = %x, &type_rnd = %x\n", &old_key, &new_key, &type_rnd[0][0]); init(); time(&t0); for (i = 0; i < COUNT_RUN; i++) update_key_non_capture((12) | (28 << 8) | (0 << 16)); // test e4 time(&t1); printf("old_key = %I64x, new_key = %I64x, c++ = %ds\n", old_key, new_key, t1 - t0); time(&t0); for (i = 0; i < COUNT_RUN; i++) update_key_non_capture_asm((12) | (28 << 8) | (0 << 16)); // test e4 time(&t1); printf("old_key = %I64x, new_key = %I64x, asm = %ds\n", old_key, new_key, t1 - t0); time(&t0); for (i = 0; i < COUNT_RUN; i++) update_key_non_capture_sse((12) | (28 << 8) | (0 << 16)); // test e4 time(&t1); printf("old_key = %I64x, new_key = %I64x, sse = %ds\n", old_key, new_key, t1 - t0); return 0; }
This page took 0.01 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.