Author: Gerd Isenberg
Date: 11:57:01 12/21/05
Go up one level in this thread
with all compile time parameters:
// Bitwise-ANDs two int buffers element by element and stores the result in a
// third buffer, with everything fixed at compile time.
//
// Template parameters:
//   T    - element type the buffers are reinterpreted as; must provide
//          operator& and assignment (e.g. int, or a 128-bit vector wrapper).
//   nint - length of each buffer, counted in ints.
//   i1   - first source buffer.
//   i2   - second source buffer.
//   i3   - destination buffer.
//
// The buffers are processed as (nint * sizeof(int)) / sizeof(T) elements of
// type T. If the byte size is not a multiple of sizeof(T), the trailing bytes
// that do not fill a whole T are left untouched.
// NOTE(review): the casts assume the buffers are suitably aligned for T -
// confirm at the declaration of the arrays passed in.
template <class T, unsigned int nint, const int* i1, const int* i2, int* i3>
__forceinline
void andIntVector()
{
    const T* t1 = (const T*)i1;
    const T* t2 = (const T*)i2;
    T* t3 = (T*)i3;

    // Number of whole T elements covered by nint ints.
    const unsigned int count = (unsigned int)((nint * sizeof(int)) / sizeof(T));
    // Largest multiple of four not exceeding count: only this prefix may be
    // handled by the four-way unrolled loop without running past the end.
    const unsigned int unrolled = count - (count % 4);

    unsigned int i = 0;
    for (; i < unrolled; i += 4) {
        t3[i+0] = t1[i+0] & t2[i+0];
        t3[i+1] = t1[i+1] & t2[i+1];
        t3[i+2] = t1[i+2] & t2[i+2];
        t3[i+3] = t1[i+3] & t2[i+3];
    }
    // Remaining zero to three elements. Both bounds are compile-time
    // constants, so this loop is empty when count is a multiple of four.
    for (; i < count; ++i) {
        t3[i] = t1[i] & t2[i];
    }
}
a few 32-bit compiles with msvc2005 express:
// Two source buffers (a1, a2) and one destination buffer (a3) of 2048 ints each,
// used as the template arguments in the compiles below.
// NOTE(review): XMM_ALIGN is not defined in this snippet; presumably it requests
// 16-byte alignment, which the movdqa accesses in the first listing need - confirm.
int XMM_ALIGN a1[2048];
int XMM_ALIGN a2[2048];
int XMM_ALIGN a3[2048];
---------------------------------------------------------------------------
// T = XMM: the listing below runs 128 passes of 64 bytes each (four 16-byte
// movdqa/pand/movdqa groups per pass), covering all 2048 ints.
andIntVector<XMM, 2048, a1, a2, a3>();
The assembly looks nice, but imho the register usage is too conservative with this
Express compiler. I would prefer:
4*movdqa xmmi, [source1+reg]
4*pand xmmi, [source2+reg]
4*movdqa [target+reg], xmmi
??$andIntVector@VXMM@@$0IAA@$1?a1@@3PAHA$1?a2@@3PAHA$1?a3@@3PAHA@@YAXXZ PROC ;
andIntVector<XMM,2048,&a1,&a2,&a3>, COMDAT
00000 33 c0 xor eax, eax
00002 b9 80 00 00 00 mov ecx, 128 ; 00000080H
00007 eb 07 8d a4 24
00 00 00 00 npad 9
$LL3@andIntVect:
00010 66 0f 6f 88 00
00 00 00 movdqa xmm1, XMMWORD PTR ?a2@@3PAHA[eax]
00018 66 0f 6f 80 00
00 00 00 movdqa xmm0, XMMWORD PTR ?a1@@3PAHA[eax]
00020 66 0f db c1 pand xmm0, xmm1
00024 66 0f 6f 88 10
00 00 00 movdqa xmm1, XMMWORD PTR ?a2@@3PAHA[eax+16]
0002c 66 0f 7f 80 00
00 00 00 movdqa XMMWORD PTR ?a3@@3PAHA[eax], xmm0
00034 66 0f 6f 80 10
00 00 00 movdqa xmm0, XMMWORD PTR ?a1@@3PAHA[eax+16]
0003c 66 0f db c1 pand xmm0, xmm1
00040 66 0f 6f 88 20
00 00 00 movdqa xmm1, XMMWORD PTR ?a2@@3PAHA[eax+32]
00048 66 0f 7f 80 10
00 00 00 movdqa XMMWORD PTR ?a3@@3PAHA[eax+16], xmm0
00050 66 0f 6f 80 20
00 00 00 movdqa xmm0, XMMWORD PTR ?a1@@3PAHA[eax+32]
00058 66 0f db c1 pand xmm0, xmm1
0005c 66 0f 6f 88 30
00 00 00 movdqa xmm1, XMMWORD PTR ?a2@@3PAHA[eax+48]
00064 66 0f 7f 80 20
00 00 00 movdqa XMMWORD PTR ?a3@@3PAHA[eax+32], xmm0
0006c 66 0f 6f 80 30
00 00 00 movdqa xmm0, XMMWORD PTR ?a1@@3PAHA[eax+48]
00074 66 0f db c1 pand xmm0, xmm1
00078 66 0f 7f 80 30
00 00 00 movdqa XMMWORD PTR ?a3@@3PAHA[eax+48], xmm0
00080 83 c0 40 add eax, 64 ; 00000040H
00083 83 e9 01 sub ecx, 1
00086 75 88 jne SHORT $LL3@andIntVect
00088 c3 ret 0
??$andIntVector@VXMM@@$0IAA@$1?a1@@3PAHA$1?a2@@3PAHA$1?a3@@3PAHA@@YAXXZ ENDP ;
andIntVector<XMM,2048,&a1,&a2,&a3>
_TEXT ENDS
---------------------------------------------------------------------------
// T = int: the listing below runs 512 passes of 16 bytes each (four 32-bit
// mov/and/mov groups per pass), covering all 2048 ints.
andIntVector<int, 2048, a1, a2, a3>();
_TEXT SEGMENT
??$andIntVector@H$0IAA@$1?a1@@3PAHA$1?a2@@3PAHA$1?a3@@3PAHA@@YAXXZ PROC ;
andIntVector<int,2048,&a1,&a2,&a3>, COMDAT
00000 33 c0 xor eax, eax
00002 b9 00 02 00 00 mov ecx, 512 ; 00000200H
00007 eb 07 8d a4 24
00 00 00 00 npad 9
$LL3@andIntVect@4:
00010 8b 90 00 00 00
00 mov edx, DWORD PTR ?a2@@3PAHA[eax]
00016 23 90 00 00 00
00 and edx, DWORD PTR ?a1@@3PAHA[eax]
0001c 83 c0 10 add eax, 16 ; 00000010H
0001f 89 90 f0 ff ff
ff mov DWORD PTR ?a3@@3PAHA[eax-16], edx
00025 8b 90 f4 ff ff
ff mov edx, DWORD PTR ?a2@@3PAHA[eax-12]
0002b 23 90 f4 ff ff
ff and edx, DWORD PTR ?a1@@3PAHA[eax-12]
00031 89 90 f4 ff ff
ff mov DWORD PTR ?a3@@3PAHA[eax-12], edx
00037 8b 90 f8 ff ff
ff mov edx, DWORD PTR ?a2@@3PAHA[eax-8]
0003d 23 90 f8 ff ff
ff and edx, DWORD PTR ?a1@@3PAHA[eax-8]
00043 89 90 f8 ff ff
ff mov DWORD PTR ?a3@@3PAHA[eax-8], edx
00049 8b 90 fc ff ff
ff mov edx, DWORD PTR ?a2@@3PAHA[eax-4]
0004f 23 90 fc ff ff
ff and edx, DWORD PTR ?a1@@3PAHA[eax-4]
00055 83 e9 01 sub ecx, 1
00058 89 90 fc ff ff
ff mov DWORD PTR ?a3@@3PAHA[eax-4], edx
0005e 75 b0 jne SHORT $LL3@andIntVect@4
00060 c3 ret 0
??$andIntVector@H$0IAA@$1?a1@@3PAHA$1?a2@@3PAHA$1?a3@@3PAHA@@YAXXZ ENDP ;
andIntVector<int,2048,&a1,&a2,&a3>
_TEXT ENDS
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.