Computer Chess Club Archives


Search

Terms

Messages

Subject: Re: Nalimov: bsf/bsr intrinsics implementation still not optimal

Author: Eugene Nalimov

Date: 10:33:44 09/23/04

Go up one level in this thread


I worked on IPF/AMD64 implementation. I tried your test case, and here is what I
got on AMD64 using latest compiler:

PUBLIC	?testbsf@@YAHH@Z				; testbsf
; Function compile flags: /Ogtpy
_TEXT	SEGMENT
mask$ = 8
?testbsf@@YAHH@Z PROC					; testbsf
; File c:\repro\w.cpp
; Line 16
	xor	eax, eax
; Line 18
	test	ecx, ecx
	mov	r8d, ecx
	je	SHORT $LN7@testbsf
	npad	7
$LL2@testbsf:
; Line 21
	bsf	ecx, r8d
; Line 22
	mov	edx, 1
	shl	edx, cl
; Line 23
	or	eax, edx
; Line 24
	xor	r8d, edx
	jne	SHORT $LL2@testbsf
$LN7@testbsf:
; Line 28
	ret	0
?testbsf@@YAHH@Z ENDP					; testbsf

For x86 I got code that is still not optimal but is better than code in your
post:

PUBLIC	?testbsf@@YAHH@Z				; testbsf
; Function compile flags: /Ogtpy
_TEXT	SEGMENT
_index$13979 = 8					; size = 4
_mask$ = 8						; size = 4
?testbsf@@YAHH@Z PROC					; testbsf
; File c:\repro\w.cpp
; Line 14
	push	esi
; Line 18
	mov	esi, DWORD PTR _mask$[esp]
	xor	eax, eax
	test	esi, esi
	je	SHORT $LN1@testbsf
	npad	5
$LL2@testbsf:
; Line 21
	bsf	ecx, esi
; Line 22
	mov	edx, 1
	shl	edx, cl
	mov	DWORD PTR _index$13979[esp], ecx
; Line 23
	or	eax, edx
; Line 24
	xor	esi, edx
	jne	SHORT $LL2@testbsf
$LN1@testbsf:
	pop	esi
; Line 28
	ret	0
?testbsf@@YAHH@Z ENDP					; testbsf

I.e. we did not remove the store, but otherwise code is fine. I doubt we can
improve it for VS 2005, as all code quality work was shut down several months
ago.

For IPF I got

// Begin code for function: ?testbsf@@YAHH@Z:
	.proc	?testbsf@@YAHH@Z#
	.align 32
?testbsf@@YAHH@Z:
// mask$ = r32
// Output regs: None
// File c:\repro\w.cpp
 {   .mmi  //R-Addr: 0X00
	mov	r8=r0					    //16.	cc:0
	mov	r29=1					    //18.R	cc:0
	cmp4.eq.unc p15,p0=r0, r32			    //18.	cc:0
 }
 {   .mmb  //R-Addr: 0X010
	addp4	r31=r32, r0				    //21.R	cc:0
	nop.m	 0
  (p15)	br.ret.dpnt.many b0;;				    //18 	cc:0
	// taken 33341571(33%), not-taken 66658429(66%)
 }
$LL2@testbsf:						    // Start of loop
 {   .mmi  //R-Addr: 0X020
	adds	r30=-1, r31;;				    //21.	cc:0
	andcm	r28=r30, r31				    //21.	cc:1
	nop.i	 0;;
 }
 {   .mii  //R-Addr: 0X030
	nop.m	 0
	popcnt	r27=r28;;				    //21.	cc:4
	shl	r26=r29, r27;;				    //22.	cc:6
 }
 {   .mii  //R-Addr: 0X040
	xor	r32=r26, r32				    //24.	cc:9
	or	r8=r26, r8;;				    //23.	cc:9
	cmp4.ne.unc p15,p0=r0, r32			    //18.	cc:10
 }
 {   .mbb  //R-Addr: 0X050
	addp4	r31=r32, r0				    //21.	cc:10
  (p15)	br.cond.dptk.many $LL2@testbsf#			    //18.	cc:10
	// taken 88000000(88%), not-taken 12000000(12%)
	br.ret.sptk.many b0;;				    //28 	cc:10
 }

Thanks,
Eugene

On September 22, 2004 at 13:55:46, Dezhi Zhao wrote:

>I'm  really happy that bit operation instructions have become intrinsics for VC
>compiler in VS 2003 and later.
>
>However the output asm code is still not optimal. It generates a pair of
>redundant memory-register save and load instrucions. I also tested VC 2005
>Express beta1. The same thing again....
>
>Here is the asm output:
>
>PUBLIC	?testbsf@@YIHH@Z				; testbsf
>; Function compile flags: /Ogty
>; File c:\documents and settings\zhao\my documents\visual studio
>projects\bsf\bsf.cpp
>;	COMDAT ?testbsf@@YIHH@Z
>_TEXT	SEGMENT
>_index$9629 = -4					; size = 4
>?testbsf@@YIHH@Z PROC NEAR				; testbsf, COMDAT
>; _mask$ = ecx
>
>; 19   : {
>
>	push	ecx
>	push	esi
>	mov	esi, ecx
>
>; 20   : 	int m1;
>; 21   : 	int clone = 0;
>
>	xor	eax, eax
>
>; 22   :
>; 23   : 	while(mask)
>
>	test	esi, esi
>	je	SHORT $L9628
>	npad	6
>$L9627:
>
>; 24   : 	{
>; 25   : 		unsigned long index;
>; 26   : 		_BitScanForward(&index, mask);
>
>	bsf	edx, esi
>	lea	ecx, DWORD PTR _index$9629[esp+8]
>	mov	DWORD PTR [ecx], edx
>
>; 27   : 		m1 = 1 << index;
>
>	mov	ecx, DWORD PTR _index$9629[esp+8]
>	mov	edx, 1
>	shl	edx, cl
>
>; 28   : 		clone |= m1;
>
>	or	eax, edx
>
>; 29   : 		mask ^= m1;
>
>	xor	esi, edx
>	jne	SHORT $L9627
>$L9628:
>	pop	esi
>
>; 30   : 	}
>; 31   :
>; 32   : 	return clone;
>; 33   : }
>
>	pop	ecx
>	ret	0
>?testbsf@@YIHH@Z ENDP					; testbsf
>_TEXT	ENDS
>
>
>The test program:
>
>// bsf.cpp : Defines the entry point for the console application.
>//
>
>#include "stdafx.h"
>#include <iostream>
>using namespace std;
>
>extern "C"
>unsigned char _BitScanForward(unsigned long* index, unsigned long mask);
>
>#pragma intrinsic(_BitScanForward)
>
>int testbsf(int mask)
>{
>	int m1;
>	int clone = 0;
>
>	while(mask)
>	{
>		unsigned long index;
>		_BitScanForward(&index, mask);
>		m1 = 1 << index;
>		clone |= m1;
>		mask ^= m1;
>	}
>
>	return clone;
>}
>
>int _tmain(int argc, _TCHAR* argv[])
>{
>   unsigned long mask = 0x1000;
>
>   cout << "Enter a positive integer as the mask: " << flush;
>   cin >> mask;
>   cout << "Mask: " << mask << " Clone: " << testbsf(mask) << endl;
>
>   return 0;
>}



This page took 0 seconds to execute

Last modified: Thu, 15 Apr 21 08:11:13 -0700

Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.