Author: Gerd Isenberg
Date: 13:53:04 12/21/05
Go up one level in this thread
Or the other way around: counting down gives a cheaper loop and break condition, and the code is a few bytes shorter.
It works for all types that define assignment (or a constructor) and the binary AND operator&.
// Bitwise-ANDs two int arrays into a third, treating all three as arrays of T.
//
// Template parameters:
//   T      - element type used for the actual work; must provide assignment
//            and a binary operator&.
//   nint   - number of ints in each array; the assert requires it to be a
//            multiple of sizeof(T).
//   i1, i2 - the two source arrays.
//   i3     - the destination array.
//
// The arrays are walked from the top down, four T elements per pass, so the
// loop exit test is just a sign check on the running index.
template <class T, unsigned int nint, const int* i1, const int* i2, int* i3>
__forceinline
void andIntVector()
{
    assert(nint % sizeof(T) == 0);

    const T* const lhs = reinterpret_cast<const T*>(i1);
    const T* const rhs = reinterpret_cast<const T*>(i2);
    T* const out = reinterpret_cast<T*>(i3);

    // Index of the first element of the topmost group of four.
    unsigned int base = (nint * sizeof(int)) / sizeof(T) - 4;

    // Stepping below zero wraps the unsigned index; viewed as a signed int it
    // is then negative, which ends the loop.
    while (static_cast<int>(base) >= 0) {
        out[base + 3] = lhs[base + 3] & rhs[base + 3];
        out[base + 2] = lhs[base + 2] & rhs[base + 2];
        out[base + 1] = lhs[base + 1] & rhs[base + 1];
        out[base + 0] = lhs[base + 0] & rhs[base + 0];
        base -= 4;
    }
}
// Instantiation with T = MMX over arrays of 2048 ints: a3 = a1 & a2.
andIntVector<MMX, 2048, a1, a2, a3>();
; Compiler listing for andIntVector<MMX,2048,&a1,&a2,&a3>.
; Each loop pass ANDs four 8-byte MMX words (32 bytes) of a1 and a2 into a3,
; starting at byte offset 8184 and working down to offset 0.
; NOTE(review): no EMMS appears in this procedure; presumably the caller is
; expected to issue it before any x87 code - confirm.
_TEXT SEGMENT
??$andIntVector@UMMX@@$0IAA@$1?a1@@3PAHA$1?a2@@3PAHA$1?a3@@3PAHA@@YAXXZ PROC ;
andIntVector<MMX,2048,&a1,&a2,&a3>, COMDAT
00000 33 c0 xor eax, eax
; eax is the running byte offset: 0 on the first pass, then -32, -64, ...
$LL3@andIntVect:
; t3[i+3] = t1[i+3] & t2[i+3]
00002 0f 6f 80 f8 1f
00 00 movq mm0, MMWORD PTR ?a1@@3PAHA[eax+8184]
00009 0f 6f 88 f8 1f
00 00 movq mm1, MMWORD PTR ?a2@@3PAHA[eax+8184]
00010 0f db c1 pand mm0, mm1
00013 0f 7f 80 f8 1f
00 00 movq MMWORD PTR ?a3@@3PAHA[eax+8184], mm0
; t3[i+2] = t1[i+2] & t2[i+2]
0001a 0f 6f 80 f0 1f
00 00 movq mm0, MMWORD PTR ?a1@@3PAHA[eax+8176]
00021 0f 6f 88 f0 1f
00 00 movq mm1, MMWORD PTR ?a2@@3PAHA[eax+8176]
00028 0f db c1 pand mm0, mm1
0002b 0f 7f 80 f0 1f
00 00 movq MMWORD PTR ?a3@@3PAHA[eax+8176], mm0
; t3[i+1] = t1[i+1] & t2[i+1]
00032 0f 6f 80 e8 1f
00 00 movq mm0, MMWORD PTR ?a1@@3PAHA[eax+8168]
00039 0f 6f 88 e8 1f
00 00 movq mm1, MMWORD PTR ?a2@@3PAHA[eax+8168]
00040 0f db c1 pand mm0, mm1
00043 0f 7f 80 e8 1f
00 00 movq MMWORD PTR ?a3@@3PAHA[eax+8168], mm0
; t3[i+0] = t1[i+0] & t2[i+0]
0004a 0f 6f 80 e0 1f
00 00 movq mm0, MMWORD PTR ?a1@@3PAHA[eax+8160]
00051 0f 6f 88 e0 1f
00 00 movq mm1, MMWORD PTR ?a2@@3PAHA[eax+8160]
00058 0f db c1 pand mm0, mm1
0005b 0f 7f 80 e0 1f
00 00 movq MMWORD PTR ?a3@@3PAHA[eax+8160], mm0
; Step down 32 bytes and repeat while eax >= -8160: 256 passes cover all
; 8192 bytes of each array.
00062 83 e8 20 sub eax, 32 ; 00000020H
00065 3d 20 e0 ff ff cmp eax, -8160 ; ffffe020H
0006a 7d 96 jge SHORT $LL3@andIntVect
0006c c3 ret 0
??$andIntVector@UMMX@@$0IAA@$1?a1@@3PAHA$1?a2@@3PAHA$1?a3@@3PAHA@@YAXXZ ENDP ;
andIntVector<MMX,2048,&a1,&a2,&a3>
_TEXT ENDS
// Instantiation with T = XMM over arrays of 2048 ints: a3 = a1 & a2.
andIntVector<XMM, 2048, a1, a2, a3>();
_TEXT ENDS
; Compiler listing for andIntVector<XMM,2048,&a1,&a2,&a3>.
; Each loop pass ANDs four 16-byte XMM words (64 bytes) of a1 and a2 into a3,
; starting at byte offset 8176 and working down to offset 0. The a2 load for
; the next word is scheduled ahead of the store of the previous result.
; NOTE(review): movdqa is the aligned move form; presumably a1, a2 and a3 are
; 16-byte aligned - confirm at their declarations.
PUBLIC ??$andIntVector@VXMM@@$0IAA@$1?a1@@3PAHA$1?a2@@3PAHA$1?a3@@3PAHA@@YAXXZ ;
andIntVector<XMM,2048,&a1,&a2,&a3>
; Function compile flags: /Ogtpy
; COMDAT ??$andIntVector@VXMM@@$0IAA@$1?a1@@3PAHA$1?a2@@3PAHA$1?a3@@3PAHA@@YAXXZ
_TEXT SEGMENT
??$andIntVector@VXMM@@$0IAA@$1?a1@@3PAHA$1?a2@@3PAHA$1?a3@@3PAHA@@YAXXZ PROC ;
andIntVector<XMM,2048,&a1,&a2,&a3>, COMDAT
00000 33 c0 xor eax, eax
; eax is the running byte offset: 0 on the first pass, then -64, -128, ...
; The 14 bytes of padding place the loop label at offset 00010.
00002 eb 0c 8d a4 24
00 00 00 00 eb
03 8d 49 00 npad 14
$LL3@andIntVect:
; t3[i+3] = t1[i+3] & t2[i+3]
00010 66 0f 6f 88 f0
1f 00 00 movdqa xmm1, XMMWORD PTR ?a2@@3PAHA[eax+8176]
00018 66 0f 6f 80 f0
1f 00 00 movdqa xmm0, XMMWORD PTR ?a1@@3PAHA[eax+8176]
00020 66 0f db c1 pand xmm0, xmm1
; t3[i+2] = t1[i+2] & t2[i+2] (its a2 load is issued before the store above)
00024 66 0f 6f 88 e0
1f 00 00 movdqa xmm1, XMMWORD PTR ?a2@@3PAHA[eax+8160]
0002c 66 0f 7f 80 f0
1f 00 00 movdqa XMMWORD PTR ?a3@@3PAHA[eax+8176], xmm0
00034 66 0f 6f 80 e0
1f 00 00 movdqa xmm0, XMMWORD PTR ?a1@@3PAHA[eax+8160]
0003c 66 0f db c1 pand xmm0, xmm1
; t3[i+1] = t1[i+1] & t2[i+1]
00040 66 0f 6f 88 d0
1f 00 00 movdqa xmm1, XMMWORD PTR ?a2@@3PAHA[eax+8144]
00048 66 0f 7f 80 e0
1f 00 00 movdqa XMMWORD PTR ?a3@@3PAHA[eax+8160], xmm0
00050 66 0f 6f 80 d0
1f 00 00 movdqa xmm0, XMMWORD PTR ?a1@@3PAHA[eax+8144]
00058 66 0f db c1 pand xmm0, xmm1
; t3[i+0] = t1[i+0] & t2[i+0]
0005c 66 0f 6f 88 c0
1f 00 00 movdqa xmm1, XMMWORD PTR ?a2@@3PAHA[eax+8128]
00064 66 0f 7f 80 d0
1f 00 00 movdqa XMMWORD PTR ?a3@@3PAHA[eax+8144], xmm0
0006c 66 0f 6f 80 c0
1f 00 00 movdqa xmm0, XMMWORD PTR ?a1@@3PAHA[eax+8128]
00074 66 0f db c1 pand xmm0, xmm1
00078 66 0f 7f 80 c0
1f 00 00 movdqa XMMWORD PTR ?a3@@3PAHA[eax+8128], xmm0
; Step down 64 bytes and repeat while eax >= -8128: 128 passes cover all
; 8192 bytes of each array.
00080 83 e8 40 sub eax, 64 ; 00000040H
00083 3d 40 e0 ff ff cmp eax, -8128 ; ffffe040H
00088 7d 86 jge SHORT $LL3@andIntVect
0008a c3 ret 0
??$andIntVector@VXMM@@$0IAA@$1?a1@@3PAHA$1?a2@@3PAHA$1?a3@@3PAHA@@YAXXZ ENDP ;
andIntVector<XMM,2048,&a1,&a2,&a3>
_TEXT ENDS
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.