Author: Gerd Isenberg
Date: 11:57:01 12/21/05
Go up one level in this thread
With all compile-time parameters:

template <class T, unsigned int nint, const int* i1, const int* i2, int* i3>
__forceinline void andIntVector()
{
    const T* t1 = (const T*)i1;
    const T* t2 = (const T*)i2;
    T* t3 = (T*)i3;
    for (unsigned int i=0; i < (nint * sizeof(int))/sizeof(T); i+= 4)
    {
        t3[i+0] = t1[i+0] & t2[i+0];
        t3[i+1] = t1[i+1] & t2[i+1];
        t3[i+2] = t1[i+2] & t2[i+2];
        t3[i+3] = t1[i+3] & t2[i+3];
    }
}

A few 32-bit compiles with msvc2005 express:

int XMM_ALIGN a1[2048];
int XMM_ALIGN a2[2048];
int XMM_ALIGN a3[2048];

---------------------------------------------------------------------------
andIntVector<XMM, 2048, a1, a2, a3>();

The assembly looks nice, but IMHO the register usage is too conservative with this express compiler. I would prefer:

4*movdqa xmmi, [source1+reg]
4*pand xmmi, [source2+reg]
4*movdqa [target+reg], xmmi

??$andIntVector@VXMM@@$0IAA@$1?a1@@3PAHA$1?a2@@3PAHA$1?a3@@3PAHA@@YAXXZ PROC ; andIntVector<XMM,2048,&a1,&a2,&a3>, COMDAT
  00000 33 c0 xor eax, eax
  00002 b9 80 00 00 00 mov ecx, 128 ; 00000080H
  00007 eb 07 8d a4 24 00 00 00 00 npad 9
$LL3@andIntVect:
  00010 66 0f 6f 88 00 00 00 00 movdqa xmm1, XMMWORD PTR ?a2@@3PAHA[eax]
  00018 66 0f 6f 80 00 00 00 00 movdqa xmm0, XMMWORD PTR ?a1@@3PAHA[eax]
  00020 66 0f db c1 pand xmm0, xmm1
  00024 66 0f 6f 88 10 00 00 00 movdqa xmm1, XMMWORD PTR ?a2@@3PAHA[eax+16]
  0002c 66 0f 7f 80 00 00 00 00 movdqa XMMWORD PTR ?a3@@3PAHA[eax], xmm0
  00034 66 0f 6f 80 10 00 00 00 movdqa xmm0, XMMWORD PTR ?a1@@3PAHA[eax+16]
  0003c 66 0f db c1 pand xmm0, xmm1
  00040 66 0f 6f 88 20 00 00 00 movdqa xmm1, XMMWORD PTR ?a2@@3PAHA[eax+32]
  00048 66 0f 7f 80 10 00 00 00 movdqa XMMWORD PTR ?a3@@3PAHA[eax+16], xmm0
  00050 66 0f 6f 80 20 00 00 00 movdqa xmm0, XMMWORD PTR ?a1@@3PAHA[eax+32]
  00058 66 0f db c1 pand xmm0, xmm1
  0005c 66 0f 6f 88 30 00 00 00 movdqa xmm1, XMMWORD PTR ?a2@@3PAHA[eax+48]
  00064 66 0f 7f 80 20 00 00 00 movdqa XMMWORD PTR ?a3@@3PAHA[eax+32], xmm0
  0006c 66 0f 6f 80 30 00 00 00 movdqa xmm0, XMMWORD PTR ?a1@@3PAHA[eax+48]
  00074 66 0f db c1 pand xmm0, xmm1
  00078 66 0f 7f 80 30 00 00 00 movdqa XMMWORD PTR ?a3@@3PAHA[eax+48], xmm0
  00080 83 c0 40 add eax, 64 ; 00000040H
  00083 83 e9 01 sub ecx, 1
  00086 75 88 jne SHORT $LL3@andIntVect
  00088 c3 ret 0
??$andIntVector@VXMM@@$0IAA@$1?a1@@3PAHA$1?a2@@3PAHA$1?a3@@3PAHA@@YAXXZ ENDP ; andIntVector<XMM,2048,&a1,&a2,&a3>
_TEXT ENDS

---------------------------------------------------------------------------
andIntVector<int, 2048, a1, a2, a3>();

_TEXT SEGMENT
??$andIntVector@H$0IAA@$1?a1@@3PAHA$1?a2@@3PAHA$1?a3@@3PAHA@@YAXXZ PROC ; andIntVector<int,2048,&a1,&a2,&a3>, COMDAT
  00000 33 c0 xor eax, eax
  00002 b9 00 02 00 00 mov ecx, 512 ; 00000200H
  00007 eb 07 8d a4 24 00 00 00 00 npad 9
$LL3@andIntVect@4:
  00010 8b 90 00 00 00 00 mov edx, DWORD PTR ?a2@@3PAHA[eax]
  00016 23 90 00 00 00 00 and edx, DWORD PTR ?a1@@3PAHA[eax]
  0001c 83 c0 10 add eax, 16 ; 00000010H
  0001f 89 90 f0 ff ff ff mov DWORD PTR ?a3@@3PAHA[eax-16], edx
  00025 8b 90 f4 ff ff ff mov edx, DWORD PTR ?a2@@3PAHA[eax-12]
  0002b 23 90 f4 ff ff ff and edx, DWORD PTR ?a1@@3PAHA[eax-12]
  00031 89 90 f4 ff ff ff mov DWORD PTR ?a3@@3PAHA[eax-12], edx
  00037 8b 90 f8 ff ff ff mov edx, DWORD PTR ?a2@@3PAHA[eax-8]
  0003d 23 90 f8 ff ff ff and edx, DWORD PTR ?a1@@3PAHA[eax-8]
  00043 89 90 f8 ff ff ff mov DWORD PTR ?a3@@3PAHA[eax-8], edx
  00049 8b 90 fc ff ff ff mov edx, DWORD PTR ?a2@@3PAHA[eax-4]
  0004f 23 90 fc ff ff ff and edx, DWORD PTR ?a1@@3PAHA[eax-4]
  00055 83 e9 01 sub ecx, 1
  00058 89 90 fc ff ff ff mov DWORD PTR ?a3@@3PAHA[eax-4], edx
  0005e 75 b0 jne SHORT $LL3@andIntVect@4
  00060 c3 ret 0
??$andIntVector@H$0IAA@$1?a1@@3PAHA$1?a2@@3PAHA$1?a3@@3PAHA@@YAXXZ ENDP ; andIntVector<int,2048,&a1,&a2,&a3>
_TEXT ENDS
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.