Author: Eugene Nalimov
Date: 10:33:44 09/23/04
Go up one level in this thread
I worked on IPF/AMD64 implementation. I tried your test case, and here is what I
got on AMD64 using latest compiler:
PUBLIC ?testbsf@@YAHH@Z ; testbsf
; Function compile flags: /Ogtpy
_TEXT SEGMENT
mask$ = 8
?testbsf@@YAHH@Z PROC ; testbsf
; File c:\repro\w.cpp
; Line 16
xor eax, eax
; Line 18
test ecx, ecx
mov r8d, ecx
je SHORT $LN7@testbsf
npad 7
$LL2@testbsf:
; Line 21
bsf ecx, r8d
; Line 22
mov edx, 1
shl edx, cl
; Line 23
or eax, edx
; Line 24
xor r8d, edx
jne SHORT $LL2@testbsf
$LN7@testbsf:
; Line 28
ret 0
?testbsf@@YAHH@Z ENDP ; testbsf
For x86 I got code that is still not optimal but is better than code in your
post:
PUBLIC ?testbsf@@YAHH@Z ; testbsf
; Function compile flags: /Ogtpy
_TEXT SEGMENT
_index$13979 = 8 ; size = 4
_mask$ = 8 ; size = 4
?testbsf@@YAHH@Z PROC ; testbsf
; File c:\repro\w.cpp
; Line 14
push esi
; Line 18
mov esi, DWORD PTR _mask$[esp]
xor eax, eax
test esi, esi
je SHORT $LN1@testbsf
npad 5
$LL2@testbsf:
; Line 21
bsf ecx, esi
; Line 22
mov edx, 1
shl edx, cl
mov DWORD PTR _index$13979[esp], ecx
; Line 23
or eax, edx
; Line 24
xor esi, edx
jne SHORT $LL2@testbsf
$LN1@testbsf:
pop esi
; Line 28
ret 0
?testbsf@@YAHH@Z ENDP ; testbsf
I.e. we did not remove the store, but otherwise code is fine. I doubt we can
improve it for VS 2005, as all code quality work was shut down several months
ago.
For IPF I got
// Begin code for function: ?testbsf@@YAHH@Z:
.proc ?testbsf@@YAHH@Z#
.align 32
?testbsf@@YAHH@Z:
// mask$ = r32
// Output regs: None
// File c:\repro\w.cpp
{ .mmi //R-Addr: 0X00
mov r8=r0 //16. cc:0
mov r29=1 //18.R cc:0
cmp4.eq.unc p15,p0=r0, r32 //18. cc:0
}
{ .mmb //R-Addr: 0X010
addp4 r31=r32, r0 //21.R cc:0
nop.m 0
(p15) br.ret.dpnt.many b0;; //18 cc:0
// taken 33341571(33%), not-taken 66658429(66%)
}
$LL2@testbsf: // Start of loop
{ .mmi //R-Addr: 0X020
adds r30=-1, r31;; //21. cc:0
andcm r28=r30, r31 //21. cc:1
nop.i 0;;
}
{ .mii //R-Addr: 0X030
nop.m 0
popcnt r27=r28;; //21. cc:4
shl r26=r29, r27;; //22. cc:6
}
{ .mii //R-Addr: 0X040
xor r32=r26, r32 //24. cc:9
or r8=r26, r8;; //23. cc:9
cmp4.ne.unc p15,p0=r0, r32 //18. cc:10
}
{ .mbb //R-Addr: 0X050
addp4 r31=r32, r0 //21. cc:10
(p15) br.cond.dptk.many $LL2@testbsf# //18. cc:10
// taken 88000000(88%), not-taken 12000000(12%)
br.ret.sptk.many b0;; //28 cc:10
}
Thanks,
Eugene
On September 22, 2004 at 13:55:46, Dezhi Zhao wrote:
>I'm really happy that bit operation instructions have become intrinsics for VC
>compiler in VS 2003 and later.
>
>However the output asm code is still not optimal. It generates a pair of
>redundant memory-register save and load instrucions. I also tested VC 2005
>Express beta1. The same thing again....
>
>Here is the asm output:
>
>PUBLIC ?testbsf@@YIHH@Z ; testbsf
>; Function compile flags: /Ogty
>; File c:\documents and settings\zhao\my documents\visual studio
>projects\bsf\bsf.cpp
>; COMDAT ?testbsf@@YIHH@Z
>_TEXT SEGMENT
>_index$9629 = -4 ; size = 4
>?testbsf@@YIHH@Z PROC NEAR ; testbsf, COMDAT
>; _mask$ = ecx
>
>; 19 : {
>
> push ecx
> push esi
> mov esi, ecx
>
>; 20 : int m1;
>; 21 : int clone = 0;
>
> xor eax, eax
>
>; 22 :
>; 23 : while(mask)
>
> test esi, esi
> je SHORT $L9628
> npad 6
>$L9627:
>
>; 24 : {
>; 25 : unsigned long index;
>; 26 : _BitScanForward(&index, mask);
>
> bsf edx, esi
> lea ecx, DWORD PTR _index$9629[esp+8]
> mov DWORD PTR [ecx], edx
>
>; 27 : m1 = 1 << index;
>
> mov ecx, DWORD PTR _index$9629[esp+8]
> mov edx, 1
> shl edx, cl
>
>; 28 : clone |= m1;
>
> or eax, edx
>
>; 29 : mask ^= m1;
>
> xor esi, edx
> jne SHORT $L9627
>$L9628:
> pop esi
>
>; 30 : }
>; 31 :
>; 32 : return clone;
>; 33 : }
>
> pop ecx
> ret 0
>?testbsf@@YIHH@Z ENDP ; testbsf
>_TEXT ENDS
>
>
>The test program:
>
>// bsf.cpp : Defines the entry point for the console application.
>//
>
>#include "stdafx.h"
>#include <iostream>
>using namespace std;
>
>extern "C"
>unsigned char _BitScanForward(unsigned long* index, unsigned long mask);
>
>#pragma intrinsic(_BitScanForward)
>
>int testbsf(int mask)
>{
> int m1;
> int clone = 0;
>
> while(mask)
> {
> unsigned long index;
> _BitScanForward(&index, mask);
> m1 = 1 << index;
> clone |= m1;
> mask ^= m1;
> }
>
> return clone;
>}
>
>int _tmain(int argc, _TCHAR* argv[])
>{
> unsigned long mask = 0x1000;
>
> cout << "Enter a positive integer as the mask: " << flush;
> cin >> mask;
> cout << "Mask: " << mask << " Clone: " << testbsf(mask) << endl;
>
> return 0;
>}
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.