Computer Chess Club Archives

Search

Terms
Messages

Subject: Re: What more is fast/slow?

Author: Tony Werten
Date: 08:04:30 08/18/00
On August 18, 2000 at 07:31:18, Vincent Diepeveen wrote:

>On August 17, 2000 at 08:29:05, Tony Werten wrote:
>
>>On August 16, 2000 at 13:11:04, Eugene Nalimov wrote:
>>
>>>On August 15, 2000 at 23:27:46, Larry Griffiths wrote:
>>>
>>>>I looked through some literature for the Intel Itanium processor and did not see
>>>>any Bit Scan Forward or Bit Scan reverse instructions.  Does anyone know for
>>>>sure if the Itanium has these instructions or something like them.
>>>>My bitboard move generation would really suck without BSF or BSR.
>>>>
>>>>Larry.
>>>
>>>No. There are no such instructions. You have to use ifs, shifts, and lookup
>>>table. Here is the example -- FirstOne()/LastOne() that I wrote for IA-64
>>>version of Crafty (it is not in the official build yet):
>>
>>Unfortunately shifts and integer multiplications are said to be 3 times as slow
>>on this new architecture ( makes you wonder how new it is ) At least that's what
>>i read today. If this is true, this processor will be a big dissapiontment for
>>chessprograms. ( well, it does run on 1.5 Ghz )
>
>How fast are array look ups on this Itanium,
>and how many instructions a clock can it do?

Array lookups need shifts so it can't be good. If i remember correctly it can do
4 instructions per clock. ( theoreticly, because the (in order ?) prediction
table can only supply 3 ) It is supposed to solve some of the problems with a 4
MB level 3 cache. ( Speed is unknown to me )

Tony

>
>>Tony
>>
>>>
>>>int FirstOne(BITBOARD arg1) {
>>>    unsigned  bias;
>>>
>>>    bias = 0;
>>>    if (arg1>>32)
>>>      arg1 >>= 32;
>>>    else
>>>      bias += 32;
>>>    if (arg1>>16)
>>>      arg1 >>= 16;
>>>    else
>>>      bias += 16;
>>>    if (arg1 >> 8)
>>>      arg1 >>= 8;
>>>    else
>>>      bias += 8;
>>>    return (first_ones_8bit[arg1]+bias);
>>>}
>>>
>>>int LastOne(BITBOARD arg1) {
>>>    unsigned  bias;
>>>
>>>    bias = 0;
>>>    if ((unsigned) arg1) {
>>>      arg1 = (unsigned) arg1;
>>>      bias = 32;
>>>    }
>>>    else
>>>      arg1 >>= 32;
>>>    if (arg1&65535) {
>>>      arg1 &= 65535;
>>>      bias += 16;
>>>    }
>>>    else
>>>      arg1 >>= 16;
>>>    if (arg1&255) {
>>>        arg1 &= 255;
>>>        bias += 8;
>>>    }
>>>    else
>>>        arg1 >>= 8;
>>>    return (last_ones_8bit[arg1]+bias);
>>>}
>>>
>>>Here is output of 64-bit MSVC for those functions:
>>>
>>>// Begin code for function: FirstOne:
>>>	.proc	FirstOne#
>>>	.align 32
>>>FirstOne: 						// COMDAT
>>> {   .mii  //R-Addr: 0X00
>>>	mov	r29=r0					    //19.	cc:0
>>>	shr.u	r30=r32, 32				    //20.	cc:0
>>>	addl	r31=@ltoff(first_ones_8bit#),gp;;	    //32.	cc:0
>>> }
>>> {   .mmi  //R-Addr: 0X010
>>>	cmp.eq.unc p7,p6=r0, r30;;			    //20.	cc:1
>>>	ld8	r28=[r31]				    //32.	cc:2
>>>  (p6)	mov	r32=r30					    //21.	cc:2
>>> }
>>> {   .mmi  //R-Addr: 0X020
>>>  (p7)	mov	r29=32;;				    //23.	cc:2
>>>	nop.m	 0
>>>	shr.u	r27=r32, 16;;				    //24.	cc:3
>>> }
>>> {   .mmi  //R-Addr: 0X030
>>>	cmp.eq.unc p9,p8=r0, r27;;			    //24.	cc:4
>>>  (p8)	mov	r32=r27					    //25.	cc:5
>>>  (p9)	adds	r29=16, r29;;				    //27.	cc:5
>>> }
>>> {   .mii  //R-Addr: 0X040
>>>	nop.m	 0
>>>	shr.u	r26=r32, 8;;				    //28.	cc:6
>>>	cmp.eq.unc p11,p10=r0, r26;;			    //28.	cc:7
>>> }
>>> {   .mii  //R-Addr: 0X050
>>>  (p10)	mov	r32=r26					    //29.	cc:8
>>>  (p11)	adds	r29=8, r29;;				    //31.	cc:8
>>>	add	r25=r28, r32;;				    //32.	cc:9
>>> }
>>> {   .mmi  //R-Addr: 0X060
>>>	ld1	r22=[r25];;				    //32.	cc:11
>>>	add	r8=r22, r29				    //32.	cc:13
>>>	nop.i	 0
>>> }
>>> {   .mfb  //R-Addr: 0X070
>>>	nop.m	 0
>>>	nop.f	 0
>>>	br.ret.sptk.few b0;;				    //33 	cc:13
>>> }
>>>
>>>// Begin code for function: LastOne:
>>>	.proc	LastOne#
>>>	.align 32
>>>LastOne:	 						// COMDAT
>>>// File c:\crafty\vcinline.h
>>> {   .mii  //R-Addr: 0X00
>>>	cmp4.eq.unc p7,p6=r0, r32			    //39.	cc:0
>>>	mov	r30=65535				    //45.	cc:0 	mov	r29=255					    //51.	cc:0
>>> }
>>> {   .mmi  //R-Addr: 0X010
>>>	mov	r28=r0;;				    //38.	cc:0
>>>	addl	r31=@ltoff(last_ones_8bit#),gp		    //57.	cc:1
>>>  (p7)	shr.u	r27=r32, 32				    //44.	cc:1
>>> }
>>> {   .mii  //R-Addr: 0X020
>>>  (p6)	mov	r28=32					    //41.	cc:1
>>>  (p6)	zxt4	r27=r32;;				    //40.	cc:1
>>>	and	r25=r30, r27				    //45.	cc:2
>>> }
>>> {   .mmi  //R-Addr: 0X030
>>>	ld8	r26=[r31];;				    //57.	cc:2
>>>	cmp.eq.unc p9,p8=r0, r25			    //45.	cc:3
>>>	nop.i	 0;;
>>> }
>>> {   .mii  //R-Addr: 0X040
>>>  (p8)	mov	r22=r25					    //46.	cc:4
>>>  (p9)	shr.u	r22=r27, 16				    //50.	cc:4
>>>  (p8)	adds	r28=16, r28;;				    //47.	cc:4
>>> }
>>> {   .mmi  //R-Addr: 0X050
>>>	and	r21=r29, r22;;				    //51.	cc:5
>>>	cmp.eq.unc p11,p10=r0, r21			    //51.	cc:6
>>>	nop.i	 0;;
>>> }
>>> {   .mii  //R-Addr: 0X060
>>>  (p10)	mov	r20=r21					    //52.	cc:7
>>>  (p11)	shr.u	r20=r22, 8				    //56.	cc:7
>>>  (p10)	adds	r28=8, r28;;				    //53.	cc:7
>>> }
>>> {   .mmi  //R-Addr: 0X070
>>>	add	r19=r26, r20;;				    //57.	cc:8
>>>	ld1	r18=[r19]				    //57.	cc:9
>>>	nop.i	 0;;
>>> }
>>> {   .mfb  //R-Addr: 0X080
>>>	add	r8=r18, r28				    //57.	cc:11
>>>	nop.f	 0
>>>	br.ret.sptk.few b0;;				    //58 	cc:11
>>> }
>>>
>>>Please notice that there are no branches -- IA-64 allows to use predicated
>>>instructions instead. So no branch mispredict penalty. Assuming that both
>>>256-bytes tables are in the L1 cache, FirstOne() is executed in 14 clock cycles,
>>>LastOne() in 12.
>>>
>>>Eugene
Re: What more is fast/slow? Vincent Diepeveen 14:55:13 08/18/00
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.