Computer Chess Club Archives


Search

Terms

Messages

Subject: It does have POPCNT

Author: Simon Finn

Date: 07:59:09 08/17/00

Go up one level in this thread


On August 16, 2000 at 13:11:04, Eugene Nalimov wrote:

>On August 15, 2000 at 23:27:46, Larry Griffiths wrote:
>
>>I looked through some literature for the Intel Itanium processor and did not see
>>any Bit Scan Forward or Bit Scan reverse instructions.  Does anyone know for
>>sure if the Itanium has these instructions or something like them.
>>My bitboard move generation would really suck without BSF or BSR.
>>
>>Larry.
>
>No. There are no such instructions. You have to use ifs, shifts, and lookup
>table. Here is the example -- FirstOne()/LastOne() that I wrote for IA-64
>version of Crafty (it is not in the official build yet):

The IA-64 architecture does have POPCNT, so it may be more efficient
to implement one of these (I can't work out which one without thinking
too hard!) by the standard trick of computing

    BITBOARD x = arg1 & (arg1 - 1);

then counting the 1 bits in x (using POPCNT)and adding/subtracting
whatever adjustment is necessary (requires thought again!).

All that should take only 4 sequential instructions.

Simon


>
>int FirstOne(BITBOARD arg1) {
>    unsigned  bias;
>
>    bias = 0;
>    if (arg1>>32)
>      arg1 >>= 32;
>    else
>      bias += 32;
>    if (arg1>>16)
>      arg1 >>= 16;
>    else
>      bias += 16;
>    if (arg1 >> 8)
>      arg1 >>= 8;
>    else
>      bias += 8;
>    return (first_ones_8bit[arg1]+bias);
>}
>
>int LastOne(BITBOARD arg1) {
>    unsigned  bias;
>
>    bias = 0;
>    if ((unsigned) arg1) {
>      arg1 = (unsigned) arg1;
>      bias = 32;
>    }
>    else
>      arg1 >>= 32;
>    if (arg1&65535) {
>      arg1 &= 65535;
>      bias += 16;
>    }
>    else
>      arg1 >>= 16;
>    if (arg1&255) {
>        arg1 &= 255;
>        bias += 8;
>    }
>    else
>        arg1 >>= 8;
>    return (last_ones_8bit[arg1]+bias);
>}
>
>Here is output of 64-bit MSVC for those functions:
>
>// Begin code for function: FirstOne:
>	.proc	FirstOne#
>	.align 32
>FirstOne: 						// COMDAT
> {   .mii  //R-Addr: 0X00
>	mov	r29=r0					    //19.	cc:0
>	shr.u	r30=r32, 32				    //20.	cc:0
>	addl	r31=@ltoff(first_ones_8bit#),gp;;	    //32.	cc:0
> }
> {   .mmi  //R-Addr: 0X010
>	cmp.eq.unc p7,p6=r0, r30;;			    //20.	cc:1
>	ld8	r28=[r31]				    //32.	cc:2
>  (p6)	mov	r32=r30					    //21.	cc:2
> }
> {   .mmi  //R-Addr: 0X020
>  (p7)	mov	r29=32;;				    //23.	cc:2
>	nop.m	 0
>	shr.u	r27=r32, 16;;				    //24.	cc:3
> }
> {   .mmi  //R-Addr: 0X030
>	cmp.eq.unc p9,p8=r0, r27;;			    //24.	cc:4
>  (p8)	mov	r32=r27					    //25.	cc:5
>  (p9)	adds	r29=16, r29;;				    //27.	cc:5
> }
> {   .mii  //R-Addr: 0X040
>	nop.m	 0
>	shr.u	r26=r32, 8;;				    //28.	cc:6
>	cmp.eq.unc p11,p10=r0, r26;;			    //28.	cc:7
> }
> {   .mii  //R-Addr: 0X050
>  (p10)	mov	r32=r26					    //29.	cc:8
>  (p11)	adds	r29=8, r29;;				    //31.	cc:8
>	add	r25=r28, r32;;				    //32.	cc:9
> }
> {   .mmi  //R-Addr: 0X060
>	ld1	r22=[r25];;				    //32.	cc:11
>	add	r8=r22, r29				    //32.	cc:13
>	nop.i	 0
> }
> {   .mfb  //R-Addr: 0X070
>	nop.m	 0
>	nop.f	 0
>	br.ret.sptk.few b0;;				    //33 	cc:13
> }
>
>// Begin code for function: LastOne:
>	.proc	LastOne#
>	.align 32
>LastOne:	 						// COMDAT
>// File c:\crafty\vcinline.h
> {   .mii  //R-Addr: 0X00
>	cmp4.eq.unc p7,p6=r0, r32			    //39.	cc:0
>	mov	r30=65535				    //45.	cc:0 	mov	r29=255					    //51.	cc:0
> }
> {   .mmi  //R-Addr: 0X010
>	mov	r28=r0;;				    //38.	cc:0
>	addl	r31=@ltoff(last_ones_8bit#),gp		    //57.	cc:1
>  (p7)	shr.u	r27=r32, 32				    //44.	cc:1
> }
> {   .mii  //R-Addr: 0X020
>  (p6)	mov	r28=32					    //41.	cc:1
>  (p6)	zxt4	r27=r32;;				    //40.	cc:1
>	and	r25=r30, r27				    //45.	cc:2
> }
> {   .mmi  //R-Addr: 0X030
>	ld8	r26=[r31];;				    //57.	cc:2
>	cmp.eq.unc p9,p8=r0, r25			    //45.	cc:3
>	nop.i	 0;;
> }
> {   .mii  //R-Addr: 0X040
>  (p8)	mov	r22=r25					    //46.	cc:4
>  (p9)	shr.u	r22=r27, 16				    //50.	cc:4
>  (p8)	adds	r28=16, r28;;				    //47.	cc:4
> }
> {   .mmi  //R-Addr: 0X050
>	and	r21=r29, r22;;				    //51.	cc:5
>	cmp.eq.unc p11,p10=r0, r21			    //51.	cc:6
>	nop.i	 0;;
> }
> {   .mii  //R-Addr: 0X060
>  (p10)	mov	r20=r21					    //52.	cc:7
>  (p11)	shr.u	r20=r22, 8				    //56.	cc:7
>  (p10)	adds	r28=8, r28;;				    //53.	cc:7
> }
> {   .mmi  //R-Addr: 0X070
>	add	r19=r26, r20;;				    //57.	cc:8
>	ld1	r18=[r19]				    //57.	cc:9
>	nop.i	 0;;
> }
> {   .mfb  //R-Addr: 0X080
>	add	r8=r18, r28				    //57.	cc:11
>	nop.f	 0
>	br.ret.sptk.few b0;;				    //58 	cc:11
> }
>
>Please notice that there are no branches -- IA-64 allows to use predicated
>instructions instead. So no branch mispredict penalty. Assuming that both
>256-bytes tables are in the L1 cache, FirstOne() is executed in 14 clock cycles,
>LastOne() in 12.
>
>Eugene



This page took 0 seconds to execute

Last modified: Thu, 15 Apr 21 08:11:13 -0700

Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.