Author: Gerd Isenberg
Date: 14:58:02 07/24/04
Go up one level in this thread
Ok, the mmx version takes 95 code bytes, while the SSE2 one has 93 code bytes. The 64-mmx version has longer independent instruction chains and less effort to max horizontally. That explains the better mmx performance too.

;-----------------------------------------------------------------------
; getMaxOf64 -- SSE2 version (compiler listing: offset, opcode bytes, source)
; Presumably  int __cdecl getMaxOf64(short s64[64])  -- inferred from the
; mangled name; confirm against the C++ source.
; In:    [esp+4] = pointer to the data; 128 bytes (64 words) are read
; Out:   eax = largest signed 16-bit element, sign-extended to 32 bits
; Clobb: eax, xmm0, xmm1
; Note:  movdqa and pmaxsw with an XMMWORD memory operand fault unless the
;        address is 16-byte aligned, so the array must be 16-byte aligned.
;-----------------------------------------------------------------------
PUBLIC	?getMaxOf64@@YAHQAF@Z				; getMaxOf64
; Function compile flags: /Ogty
;	COMDAT ?getMaxOf64@@YAHQAF@Z
_TEXT	SEGMENT
_s64$ = 8
?getMaxOf64@@YAHQAF@Z PROC NEAR				; getMaxOf64, COMDAT
  00000	8b 44 24 04	 mov	 eax, DWORD PTR _s64$[esp-4]
; Vertical max: two independent accumulators (xmm1 / xmm0) take alternating
; 16-byte rows, so consecutive pmaxsw instructions do not depend on each other.
  00004	66 0f 6f 08	 movdqa	 xmm1, XMMWORD PTR [eax]
  00008	66 0f 6f 40 10	 movdqa	 xmm0, XMMWORD PTR [eax+16]
  0000d	66 0f ee 48 20	 pmaxsw	 xmm1, XMMWORD PTR [eax+32]
  00012	66 0f ee 40 30	 pmaxsw	 xmm0, XMMWORD PTR [eax+48]
  00017	66 0f ee 48 40	 pmaxsw	 xmm1, XMMWORD PTR [eax+64]
  0001c	66 0f ee 40 50	 pmaxsw	 xmm0, XMMWORD PTR [eax+80]
  00021	66 0f ee 48 60	 pmaxsw	 xmm1, XMMWORD PTR [eax+96]
  00026	66 0f ee 40 70	 pmaxsw	 xmm0, XMMWORD PTR [eax+112]
; Merge the two accumulators: xmm0 = per-lane max over all eight rows.
  0002b	66 0f ee c1	 pmaxsw	 xmm0, xmm1
; Horizontal max: shift right by 2, 4 and 8 bytes and max with the unshifted
; copy each time; afterwards word 0 holds the max of all eight word lanes.
; (Opcode 66 0F 73 /3 is PSRLDQ, the byte-wise logical shift right.)
  0002f	66 0f 6f c8	 movdqa	 xmm1, xmm0
  00033	66 0f 73 d8 02	 psrldq	 xmm0, 2
  00038	66 0f ee c1	 pmaxsw	 xmm0, xmm1
  0003c	66 0f 6f c8	 movdqa	 xmm1, xmm0
  00040	66 0f 73 d8 04	 psrldq	 xmm0, 4
  00045	66 0f ee c1	 pmaxsw	 xmm0, xmm1
  00049	66 0f 6f c8	 movdqa	 xmm1, xmm0
  0004d	66 0f 73 d8 08	 psrldq	 xmm0, 8
  00052	66 0f ee c1	 pmaxsw	 xmm0, xmm1
; pextrw zero-extends word 0 into eax; cwde then sign-extends ax into eax.
  00056	66 0f c5 c0 00	 pextrw	 eax, xmm0, 0
  0005b	98		 cwde
  0005c	c3		 ret	 0
?getMaxOf64@@YAHQAF@Z ENDP				; getMaxOf64
_TEXT	ENDS

;-----------------------------------------------------------------------
; _getMaxOf64 -- MMX version (uses pmaxsw on MMX registers, an SSE-era
; extension to MMX)
; Presumably  int __cdecl _getMaxOf64(short s64[64])  -- inferred from the
; mangled name; confirm against the C++ source.
; In:    [esp+4] = pointer to the data; 128 bytes (64 words) are read
; Out:   eax = largest signed 16-bit element, sign-extended to 32 bits
; Clobb: eax, mm0, mm1
; NOTE(review): there is no EMMS before the return, so the x87 tag word is
;        left in MMX state; the caller has to execute EMMS before running
;        any x87 floating-point code -- confirm that the call sites do so.
;-----------------------------------------------------------------------
PUBLIC	?_getMaxOf64@@YAHQAF@Z				; _getMaxOf64
; Function compile flags: /Ogty
;	COMDAT ?_getMaxOf64@@YAHQAF@Z
_TEXT	SEGMENT
_s64$ = 8
?_getMaxOf64@@YAHQAF@Z PROC NEAR			; _getMaxOf64, COMDAT
  00000	8b 44 24 04	 mov	 eax, DWORD PTR _s64$[esp-4]
; Vertical max: two independent accumulators (mm1 / mm0) take alternating
; 8-byte rows, sixteen rows in total.
  00004	0f 6f 08	 movq	 mm1, MMWORD PTR [eax]
  00007	0f 6f 40 08	 movq	 mm0, MMWORD PTR [eax+8]
  0000b	0f ee 48 10	 pmaxsw	 mm1, MMWORD PTR [eax+16]
  0000f	0f ee 40 18	 pmaxsw	 mm0, MMWORD PTR [eax+24]
  00013	0f ee 48 20	 pmaxsw	 mm1, MMWORD PTR [eax+32]
  00017	0f ee 40 28	 pmaxsw	 mm0, MMWORD PTR [eax+40]
  0001b	0f ee 48 30	 pmaxsw	 mm1, MMWORD PTR [eax+48]
  0001f	0f ee 40 38	 pmaxsw	 mm0, MMWORD PTR [eax+56]
  00023	0f ee 48 40	 pmaxsw	 mm1, MMWORD PTR [eax+64]
  00027	0f ee 40 48	 pmaxsw	 mm0, MMWORD PTR [eax+72]
  0002b	0f ee 48 50	 pmaxsw	 mm1, MMWORD PTR [eax+80]
  0002f	0f ee 40 58	 pmaxsw	 mm0, MMWORD PTR [eax+88]
  00033	0f ee 48 60	 pmaxsw	 mm1, MMWORD PTR [eax+96]
  00037	0f ee 40 68	 pmaxsw	 mm0, MMWORD PTR [eax+104]
  0003b	0f ee 48 70	 pmaxsw	 mm1, MMWORD PTR [eax+112]
  0003f	0f ee 40 78	 pmaxsw	 mm0, MMWORD PTR [eax+120]
; Merge the two accumulators: mm0 = per-lane max over all sixteen rows.
  00043	0f ee c1	 pmaxsw	 mm0, mm1
; Horizontal max: only two shift/max steps are needed for four word lanes
; (16 bits, then 32 bits); afterwards word 0 holds the overall max.
  00046	0f 7f c1	 movq	 mm1, mm0
  00049	0f 73 d0 10	 psrlq	 mm0, 16			; 00000010H
  0004d	0f ee c1	 pmaxsw	 mm0, mm1
  00050	0f 7f c1	 movq	 mm1, mm0
  00053	0f 73 d0 20	 psrlq	 mm0, 32			; 00000020H
  00057	0f ee c1	 pmaxsw	 mm0, mm1
; movd copies the low 32 bits of mm0; cwde keeps only ax, sign-extended.
  0005a	0f 7e c0	 movd	 eax, mm0
  0005d	98		 cwde
  0005e	c3		 ret	 0
?_getMaxOf64@@YAHQAF@Z ENDP				; _getMaxOf64
_TEXT	ENDS
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.