Computer Chess Club Archives

Search

Terms
Messages

Subject: Re: Crafty profits little from Itanium and Opteron versus Commercials

Author: Gerd Isenberg
Date: 13:12:28 08/09/03
On August 08, 2003 at 15:53:00, Eugene Nalimov wrote:

>On August 07, 2003 at 15:40:33, Gerd Isenberg wrote:
>
>>...
>>3. bsf, still vector path and 9 cycles.
>
>Hmm, on Itanium2 I can do BSF/BSR equivalent in 8/9 clocks without BSF/BSR
>instructions:

Hi Eugene,

I see, Itanium2 has popcount, thanks for the lession.
I refered to opteron's bsf, but anyway, nice lession how to use intrinsics.
My bitscan collection grows and grows ;-)

I'm interested in performance of opterons bsf/btr instructions versus
Matt Taylor's approach with 64-bit magic de Bruijn multiplication, may be even
without lookup:

typedef unsigned long long BitBoard;

int bitScanAndReset(BitBoard &bb)
{
	BitBoard lsb = bb ^ (bb - 1);
	bb &= ~lsb;
	return tbl64[(lsb * 0x03f08c5392cd5dbd) >> (64 - 6)];
}

I guess a 32-bit folded version is inferiour on opeteron, because there is a
real shift >> 32 with 64 bit registers.

int bitScanAndReset(BitBoard &bb)
{
	BitBoard lsb = bb ^ (bb - 1);
	bb &= ~lsb;
	unsigned __int32 foldedLSB = ((int) lsb) ^ ((int)(lsb>>32));
	return lsz64_tbl[(foldedLSB * 0x78291ACF) >> (32-6)];
}

Thanks,
Gerd

>
>--------------------------------------------------------------------------
>
>union _m64 {
>    unsigned long long u64;
>    // Other stuff deleted as irrelevant
>};
>
>extern union _m64 __m64_popcnt(unsigned long long);
>#pragma intrinsic(__m64_popcnt)
>
>unsigned long long BSF (unsigned long long x)
>{
>    union _m64 y;
>
>    y = __m64_popcnt(x^(x-1));
>    return y.u64-1;
>}
>
>extern unsigned char u8bit[256];
>unsigned long long BSR (unsigned long long x)
>{
>    unsigned long long r, s, t, u, w, y, z;
>    unsigned long long result;
>    unsigned char *p;
>
>    y = x >> 32;
>    p = u8bit;
>    if (y == 0) {
>        t = x;
>        u = (unsigned short) x;
>        result = 0;
>    }
>    else {
>        t = y;
>        u = (unsigned short) y;
>        result = 32;
>    }
>    w = t >> 16;
>    if (u == t)
>        r = (unsigned char) t;
>    else {
>        t = w;
>        r = (unsigned char) w;
>        result += 16;
>    }
>    s = t >> 8;
>    if (t == r)
>        p += t;
>    else {
>        p += s;
>        result += 8;
>    }
>    return result + *p;
>}
>
>--------------------------------------------------------------------------
>
>// Listing generated by Microsoft (R) Optimizing Compiler Version 14.00.30807
>
>	.file	"C:/repro/bb.c"
>	.radix	D
>	.section	.text,	"ax", "progbits"
>	.align 32
>	.section	.pdata,	"a", "progbits"
>	.align 4
>	.section	.xdata,	"a", "progbits"
>	.align 8
>	.section	.data,	"wa", "progbits"
>	.align 16
>	.section	.rdata,	"a", "progbits"
>	.align 16
>	.section	.bss,	"wa", "nobits"
>	.align 16
>	.section	.debug$S,	"ax", "progbits"
>	.align 16
>	.section	.tls$,	"was", "progbits"
>	.align 16
>	.section	.sdata,	"was", "progbits"
>	.align 16
>	.section	.sbss,	"was", "nobits"
>	.align 16
>	.section	.srdata,	"as", "progbits"
>	.align 16
>	.section	.rdata,	"a", "progbits"
>	.align 16
>	.type	BSF#	,@function
>        .global BSF#
>// Function compile flags: /Ogtp
>	.section	.text
>
>// Begin code for function: BSF:
>	.proc	BSF#
>	.align 32
>BSF:
>// x$ = r32
>// Output regs: None
>// File c:\repro\bb.c
> {   .mmi  //R-Addr: 0X00
>	adds	r31=-1, r32;;				    //12.	cc:0
>	xor	r30=r31, r32				    //12.	cc:1
>	nop.i	 0;;
> }
> {   .mii  //R-Addr: 0X010
>	nop.m	 0
>	popcnt	r29=r30;;				    //12.	cc:4
>	adds	r8=-1, r29				    //13.	cc:7
> }
> {   .mmb  //R-Addr: 0X020
>	nop.m	 0
>	nop.m	 0
>	br.ret.sptk.many b0;;				    //14 	cc:7
> }
>// End code for function:
>	.endp	BSF#
>	.type	BSR#	,@function
>        .global BSR#
>	.type	u8bit#	,@object
>        .global u8bit#
>// Function compile flags: /Ogtp
>	.section	.text
>
>// Begin code for function: BSR:
>	.proc	BSR#
>	.align 32
>BSR:
>// x$ = r32
>// Output regs: None
> {   .mii  //R-Addr: 0X00
>	addl	r31=@ltoff(u8bit#),gp			    //47.Rm	cc:0
>	shr.u	r27=r32, 32;;				    //23.	cc:0
>	cmp.ne.unc p7,p6=r0, r27			    //25.	cc:1
> }
> {   .mmi  //R-Addr: 0X010
>	ld8	r25=[r31];;				    //47.Rm	cc:1
>  (p6)	mov	r27=r32					    //26.	cc:2
>  (p7)	extr.u	r29=r32, 32, 16				    //32.	cc:2
> }
> {   .mii  //R-Addr: 0X020
>  (p7)	mov	r28=32					    //33.	cc:2
>  (p6)	zxt2	r29=r32					    //27.	cc:2
>  (p6)	mov	r28=r0;;				    //28.	cc:2
> }
> {   .mii  //R-Addr: 0X030
>	cmp.ne.unc p9,p8=r29, r27			    //36.	cc:3
>	shr.u	r30=r27, 16;;				    //35.	cc:3
>  (p9)	zxt1	r22=r30					    //40.	cc:4
> }
> {   .mii  //R-Addr: 0X040
>  (p9)	mov	r27=r30					    //39.	cc:4
>  (p8)	zxt1	r22=r27					    //37.	cc:4
>  (p9)	adds	r28=16, r28;;				    //41.	cc:4
> }
> {   .mii  //R-Addr: 0X050
>	add	r26=r27, r25				    //45.R	cc:5
>	shr.u	r21=r27, 8				    //43.R	cc:5
>	cmp.ne.unc p10,p0=r27, r22;;			    //44.	cc:5
> }
> {   .mib  //R-Addr: 0X060
>  (p10)	add	r26=r21, r25				    //47.	cc:6
>  (p10)	adds	r28=8, r28				    //48.	cc:6
>	nop.b	 0;;
> }
> {   .mmi  //R-Addr: 0X070
>	ld1	r31=[r26];;				    //50.	cc:7
>	add	r8=r31, r28				    //50.	cc:8
>	nop.i	 0
> }
> {   .mmb  //R-Addr: 0X080
>	nop.m	 0
>	nop.m	 0
>	br.ret.sptk.many b0;;				    //51 	cc:8
> }
>// End code for function:
>	.endp	BSR#
>
>// Total code size for all functions: 0X0c0 bytes (12 bundles)
>
>// END
>
>--------------------------------------------------------------------------
>
>:-)
>
>Thanks,
>Eugene
>
>>But i have to wait some time, until i can try it ;-(
>>
>>Cheers,
>>Gerd
Re: Crafty profits little from Itanium and Opteron versus Commercials Ricardo Gibert 17:38:27 08/09/03
- Re: Crafty profits little from Itanium and Opteron versus Commercials Gerd Isenberg 23:28:12 08/09/03
small improvement Gerd Isenberg 13:45:16 08/09/03
This page took 0.01 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.