Author: Eugene Nalimov
Date: 10:33:44 09/23/04
Go up one level in this thread
I worked on IPF/AMD64 implementation. I tried your test case, and here is what I got on AMD64 using latest compiler: PUBLIC ?testbsf@@YAHH@Z ; testbsf ; Function compile flags: /Ogtpy _TEXT SEGMENT mask$ = 8 ?testbsf@@YAHH@Z PROC ; testbsf ; File c:\repro\w.cpp ; Line 16 xor eax, eax ; Line 18 test ecx, ecx mov r8d, ecx je SHORT $LN7@testbsf npad 7 $LL2@testbsf: ; Line 21 bsf ecx, r8d ; Line 22 mov edx, 1 shl edx, cl ; Line 23 or eax, edx ; Line 24 xor r8d, edx jne SHORT $LL2@testbsf $LN7@testbsf: ; Line 28 ret 0 ?testbsf@@YAHH@Z ENDP ; testbsf For x86 I got code that is still not optimal but is better than code in your post: PUBLIC ?testbsf@@YAHH@Z ; testbsf ; Function compile flags: /Ogtpy _TEXT SEGMENT _index$13979 = 8 ; size = 4 _mask$ = 8 ; size = 4 ?testbsf@@YAHH@Z PROC ; testbsf ; File c:\repro\w.cpp ; Line 14 push esi ; Line 18 mov esi, DWORD PTR _mask$[esp] xor eax, eax test esi, esi je SHORT $LN1@testbsf npad 5 $LL2@testbsf: ; Line 21 bsf ecx, esi ; Line 22 mov edx, 1 shl edx, cl mov DWORD PTR _index$13979[esp], ecx ; Line 23 or eax, edx ; Line 24 xor esi, edx jne SHORT $LL2@testbsf $LN1@testbsf: pop esi ; Line 28 ret 0 ?testbsf@@YAHH@Z ENDP ; testbsf I.e. we did not remove the store, but otherwise code is fine. I doubt we can improve it for VS 2005, as all code quality work was shut down several months ago. For IPF I got // Begin code for function: ?testbsf@@YAHH@Z: .proc ?testbsf@@YAHH@Z# .align 32 ?testbsf@@YAHH@Z: // mask$ = r32 // Output regs: None // File c:\repro\w.cpp { .mmi //R-Addr: 0X00 mov r8=r0 //16. cc:0 mov r29=1 //18.R cc:0 cmp4.eq.unc p15,p0=r0, r32 //18. cc:0 } { .mmb //R-Addr: 0X010 addp4 r31=r32, r0 //21.R cc:0 nop.m 0 (p15) br.ret.dpnt.many b0;; //18 cc:0 // taken 33341571(33%), not-taken 66658429(66%) } $LL2@testbsf: // Start of loop { .mmi //R-Addr: 0X020 adds r30=-1, r31;; //21. cc:0 andcm r28=r30, r31 //21. cc:1 nop.i 0;; } { .mii //R-Addr: 0X030 nop.m 0 popcnt r27=r28;; //21. cc:4 shl r26=r29, r27;; //22. cc:6 } { .mii //R-Addr: 0X040 xor r32=r26, r32 //24. cc:9 or r8=r26, r8;; //23. cc:9 cmp4.ne.unc p15,p0=r0, r32 //18. cc:10 } { .mbb //R-Addr: 0X050 addp4 r31=r32, r0 //21. cc:10 (p15) br.cond.dptk.many $LL2@testbsf# //18. cc:10 // taken 88000000(88%), not-taken 12000000(12%) br.ret.sptk.many b0;; //28 cc:10 } Thanks, Eugene On September 22, 2004 at 13:55:46, Dezhi Zhao wrote: >I'm really happy that bit operation instructions have become intrinsics for VC >compiler in VS 2003 and later. > >However the output asm code is still not optimal. It generates a pair of >redundant memory-register save and load instrucions. I also tested VC 2005 >Express beta1. The same thing again.... > >Here is the asm output: > >PUBLIC ?testbsf@@YIHH@Z ; testbsf >; Function compile flags: /Ogty >; File c:\documents and settings\zhao\my documents\visual studio >projects\bsf\bsf.cpp >; COMDAT ?testbsf@@YIHH@Z >_TEXT SEGMENT >_index$9629 = -4 ; size = 4 >?testbsf@@YIHH@Z PROC NEAR ; testbsf, COMDAT >; _mask$ = ecx > >; 19 : { > > push ecx > push esi > mov esi, ecx > >; 20 : int m1; >; 21 : int clone = 0; > > xor eax, eax > >; 22 : >; 23 : while(mask) > > test esi, esi > je SHORT $L9628 > npad 6 >$L9627: > >; 24 : { >; 25 : unsigned long index; >; 26 : _BitScanForward(&index, mask); > > bsf edx, esi > lea ecx, DWORD PTR _index$9629[esp+8] > mov DWORD PTR [ecx], edx > >; 27 : m1 = 1 << index; > > mov ecx, DWORD PTR _index$9629[esp+8] > mov edx, 1 > shl edx, cl > >; 28 : clone |= m1; > > or eax, edx > >; 29 : mask ^= m1; > > xor esi, edx > jne SHORT $L9627 >$L9628: > pop esi > >; 30 : } >; 31 : >; 32 : return clone; >; 33 : } > > pop ecx > ret 0 >?testbsf@@YIHH@Z ENDP ; testbsf >_TEXT ENDS > > >The test program: > >// bsf.cpp : Defines the entry point for the console application. >// > >#include "stdafx.h" >#include <iostream> >using namespace std; > >extern "C" >unsigned char _BitScanForward(unsigned long* index, unsigned long mask); > >#pragma intrinsic(_BitScanForward) > >int testbsf(int mask) >{ > int m1; > int clone = 0; > > while(mask) > { > unsigned long index; > _BitScanForward(&index, mask); > m1 = 1 << index; > clone |= m1; > mask ^= m1; > } > > return clone; >} > >int _tmain(int argc, _TCHAR* argv[]) >{ > unsigned long mask = 0x1000; > > cout << "Enter a positive integer as the mask: " << flush; > cin >> mask; > cout << "Mask: " << mask << " Clone: " << testbsf(mask) << endl; > > return 0; >}
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.