Computer Chess Club Archives


Search

Terms

Messages

Subject: Re: It does have POPCNT

Author: Eugene Nalimov

Date: 09:56:20 08/17/00

Go up one level in this thread


You are only partially right. Yes, there is 'popcnt' instruction. But it has
terrible latency -- and what's even worse, if programmer (or compiler) is not
cautious enough, latency is even higher (for detalis see "Itanium Processor
Microarchitecture Reference", Intel order number 245473-001; you can download it
from Intel web site for free).

Code for this version of LastOne() can be (meaningful instructions only,
omitting nops and do not bundling):

    sub     r8=r32, 1;;    //cc:0
    xor     r8=r8, r32;;   //cc:1
    shr.u   r8=r8, 1;;     //cc:2
    popcnt  r8=r8;;        //cc:5

And now the cuttest part: if you'll try to use the result too early (less than
in 4 clock cycles), you'll get pipeline stall, costing you 10 clock cycles
(problem desription oversimplified, but must be true for the chess program).

So that version of LastOne() in the best case will take 10 clock cycles, vs. 12
for C version. In the worst case (i.e. LastOne() was not inlined, you do not put
4 extra stop bits before return, and you'll try to use it result immediately
after call) it'll cost you 16 cycles.

Eugene

On August 17, 2000 at 11:37:48, Simon Finn wrote:

>On August 17, 2000 at 10:59:09, Simon Finn wrote:
>
>>On August 16, 2000 at 13:11:04, Eugene Nalimov wrote:
>>
>>>On August 15, 2000 at 23:27:46, Larry Griffiths wrote:
>>>
>>>>I looked through some literature for the Intel Itanium processor and did not see
>>>>any Bit Scan Forward or Bit Scan reverse instructions.  Does anyone know for
>>>>sure if the Itanium has these instructions or something like them.
>>>>My bitboard move generation would really suck without BSF or BSR.
>>>>
>>>>Larry.
>>>
>>>No. There are no such instructions. You have to use ifs, shifts, and lookup
>>>table. Here is the example -- FirstOne()/LastOne() that I wrote for IA-64
>>>version of Crafty (it is not in the official build yet):
>>
>>The IA-64 architecture does have POPCNT, so it may be more efficient
>>to implement one of these (I can't work out which one without thinking
>>too hard!) by the standard trick of computing
>>
>>    BITBOARD x = arg1 & (arg1 - 1);
>
>Whoops - "&" (AND) should be "^" (XOR).
>
>
>>
>>then counting the 1 bits in x (using POPCNT)and adding/subtracting
>>whatever adjustment is necessary (requires thought again!).
>
>I first saw this trick on the Dark Thought site.
>
>Simon
>
>>
>>All that should take only 4 sequential instructions.
>>
>>Simon
>>
>>
>>>
>>>int FirstOne(BITBOARD arg1) {
>>>    unsigned  bias;
>>>
>>>    bias = 0;
>>>    if (arg1>>32)
>>>      arg1 >>= 32;
>>>    else
>>>      bias += 32;
>>>    if (arg1>>16)
>>>      arg1 >>= 16;
>>>    else
>>>      bias += 16;
>>>    if (arg1 >> 8)
>>>      arg1 >>= 8;
>>>    else
>>>      bias += 8;
>>>    return (first_ones_8bit[arg1]+bias);
>>>}
>>>
>>>int LastOne(BITBOARD arg1) {
>>>    unsigned  bias;
>>>
>>>    bias = 0;
>>>    if ((unsigned) arg1) {
>>>      arg1 = (unsigned) arg1;
>>>      bias = 32;
>>>    }
>>>    else
>>>      arg1 >>= 32;
>>>    if (arg1&65535) {
>>>      arg1 &= 65535;
>>>      bias += 16;
>>>    }
>>>    else
>>>      arg1 >>= 16;
>>>    if (arg1&255) {
>>>        arg1 &= 255;
>>>        bias += 8;
>>>    }
>>>    else
>>>        arg1 >>= 8;
>>>    return (last_ones_8bit[arg1]+bias);
>>>}
>>>
>>>Here is output of 64-bit MSVC for those functions:
>>>
>>>// Begin code for function: FirstOne:
>>>	.proc	FirstOne#
>>>	.align 32
>>>FirstOne: 						// COMDAT
>>> {   .mii  //R-Addr: 0X00
>>>	mov	r29=r0					    //19.	cc:0
>>>	shr.u	r30=r32, 32				    //20.	cc:0
>>>	addl	r31=@ltoff(first_ones_8bit#),gp;;	    //32.	cc:0
>>> }
>>> {   .mmi  //R-Addr: 0X010
>>>	cmp.eq.unc p7,p6=r0, r30;;			    //20.	cc:1
>>>	ld8	r28=[r31]				    //32.	cc:2
>>>  (p6)	mov	r32=r30					    //21.	cc:2
>>> }
>>> {   .mmi  //R-Addr: 0X020
>>>  (p7)	mov	r29=32;;				    //23.	cc:2
>>>	nop.m	 0
>>>	shr.u	r27=r32, 16;;				    //24.	cc:3
>>> }
>>> {   .mmi  //R-Addr: 0X030
>>>	cmp.eq.unc p9,p8=r0, r27;;			    //24.	cc:4
>>>  (p8)	mov	r32=r27					    //25.	cc:5
>>>  (p9)	adds	r29=16, r29;;				    //27.	cc:5
>>> }
>>> {   .mii  //R-Addr: 0X040
>>>	nop.m	 0
>>>	shr.u	r26=r32, 8;;				    //28.	cc:6
>>>	cmp.eq.unc p11,p10=r0, r26;;			    //28.	cc:7
>>> }
>>> {   .mii  //R-Addr: 0X050
>>>  (p10)	mov	r32=r26					    //29.	cc:8
>>>  (p11)	adds	r29=8, r29;;				    //31.	cc:8
>>>	add	r25=r28, r32;;				    //32.	cc:9
>>> }
>>> {   .mmi  //R-Addr: 0X060
>>>	ld1	r22=[r25];;				    //32.	cc:11
>>>	add	r8=r22, r29				    //32.	cc:13
>>>	nop.i	 0
>>> }
>>> {   .mfb  //R-Addr: 0X070
>>>	nop.m	 0
>>>	nop.f	 0
>>>	br.ret.sptk.few b0;;				    //33 	cc:13
>>> }
>>>
>>>// Begin code for function: LastOne:
>>>	.proc	LastOne#
>>>	.align 32
>>>LastOne:	 						// COMDAT
>>>// File c:\crafty\vcinline.h
>>> {   .mii  //R-Addr: 0X00
>>>	cmp4.eq.unc p7,p6=r0, r32			    //39.	cc:0
>>>	mov	r30=65535				    //45.	cc:0 	mov	r29=255					    //51.	cc:0
>>> }
>>> {   .mmi  //R-Addr: 0X010
>>>	mov	r28=r0;;				    //38.	cc:0
>>>	addl	r31=@ltoff(last_ones_8bit#),gp		    //57.	cc:1
>>>  (p7)	shr.u	r27=r32, 32				    //44.	cc:1
>>> }
>>> {   .mii  //R-Addr: 0X020
>>>  (p6)	mov	r28=32					    //41.	cc:1
>>>  (p6)	zxt4	r27=r32;;				    //40.	cc:1
>>>	and	r25=r30, r27				    //45.	cc:2
>>> }
>>> {   .mmi  //R-Addr: 0X030
>>>	ld8	r26=[r31];;				    //57.	cc:2
>>>	cmp.eq.unc p9,p8=r0, r25			    //45.	cc:3
>>>	nop.i	 0;;
>>> }
>>> {   .mii  //R-Addr: 0X040
>>>  (p8)	mov	r22=r25					    //46.	cc:4
>>>  (p9)	shr.u	r22=r27, 16				    //50.	cc:4
>>>  (p8)	adds	r28=16, r28;;				    //47.	cc:4
>>> }
>>> {   .mmi  //R-Addr: 0X050
>>>	and	r21=r29, r22;;				    //51.	cc:5
>>>	cmp.eq.unc p11,p10=r0, r21			    //51.	cc:6
>>>	nop.i	 0;;
>>> }
>>> {   .mii  //R-Addr: 0X060
>>>  (p10)	mov	r20=r21					    //52.	cc:7
>>>  (p11)	shr.u	r20=r22, 8				    //56.	cc:7
>>>  (p10)	adds	r28=8, r28;;				    //53.	cc:7
>>> }
>>> {   .mmi  //R-Addr: 0X070
>>>	add	r19=r26, r20;;				    //57.	cc:8
>>>	ld1	r18=[r19]				    //57.	cc:9
>>>	nop.i	 0;;
>>> }
>>> {   .mfb  //R-Addr: 0X080
>>>	add	r8=r18, r28				    //57.	cc:11
>>>	nop.f	 0
>>>	br.ret.sptk.few b0;;				    //58 	cc:11
>>> }
>>>
>>>Please notice that there are no branches -- IA-64 allows to use predicated
>>>instructions instead. So no branch mispredict penalty. Assuming that both
>>>256-bytes tables are in the L1 cache, FirstOne() is executed in 14 clock cycles,
>>>LastOne() in 12.
>>>
>>>Eugene



This page took 0.01 seconds to execute

Last modified: Thu, 15 Apr 21 08:11:13 -0700

Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.