Computer Chess Club Archives


Search

Terms

Messages

Subject: Re: Crafty profits little from Itanium and Opteron versus Commercials

Author: Eugene Nalimov

Date: 12:53:00 08/08/03

Go up one level in this thread


On August 07, 2003 at 15:40:33, Gerd Isenberg wrote:

>...
>3. bsf, still vector path and 9 cycles.

Hmm, on Itanium2 I can do BSF/BSR equivalent in 8/9 clocks without BSF/BSR
instructions:

--------------------------------------------------------------------------

union _m64 {
    unsigned long long u64;
    // Other stuff deleted as irrelevant
};

extern union _m64 __m64_popcnt(unsigned long long);
#pragma intrinsic(__m64_popcnt)

unsigned long long BSF (unsigned long long x)
{
    union _m64 y;

    y = __m64_popcnt(x^(x-1));
    return y.u64-1;
}

extern unsigned char u8bit[256];
unsigned long long BSR (unsigned long long x)
{
    unsigned long long r, s, t, u, w, y, z;
    unsigned long long result;
    unsigned char *p;

    y = x >> 32;
    p = u8bit;
    if (y == 0) {
        t = x;
        u = (unsigned short) x;
        result = 0;
    }
    else {
        t = y;
        u = (unsigned short) y;
        result = 32;
    }
    w = t >> 16;
    if (u == t)
        r = (unsigned char) t;
    else {
        t = w;
        r = (unsigned char) w;
        result += 16;
    }
    s = t >> 8;
    if (t == r)
        p += t;
    else {
        p += s;
        result += 8;
    }
    return result + *p;
}

--------------------------------------------------------------------------

// Listing generated by Microsoft (R) Optimizing Compiler Version 14.00.30807

	.file	"C:/repro/bb.c"
	.radix	D
	.section	.text,	"ax", "progbits"
	.align 32
	.section	.pdata,	"a", "progbits"
	.align 4
	.section	.xdata,	"a", "progbits"
	.align 8
	.section	.data,	"wa", "progbits"
	.align 16
	.section	.rdata,	"a", "progbits"
	.align 16
	.section	.bss,	"wa", "nobits"
	.align 16
	.section	.debug$S,	"ax", "progbits"
	.align 16
	.section	.tls$,	"was", "progbits"
	.align 16
	.section	.sdata,	"was", "progbits"
	.align 16
	.section	.sbss,	"was", "nobits"
	.align 16
	.section	.srdata,	"as", "progbits"
	.align 16
	.section	.rdata,	"a", "progbits"
	.align 16
	.type	BSF#	,@function
        .global BSF#
// Function compile flags: /Ogtp
	.section	.text

// Begin code for function: BSF:
	.proc	BSF#
	.align 32
BSF:
// x$ = r32
// Output regs: None
// File c:\repro\bb.c
 {   .mmi  //R-Addr: 0X00
	adds	r31=-1, r32;;				    //12.	cc:0
	xor	r30=r31, r32				    //12.	cc:1
	nop.i	 0;;
 }
 {   .mii  //R-Addr: 0X010
	nop.m	 0
	popcnt	r29=r30;;				    //12.	cc:4
	adds	r8=-1, r29				    //13.	cc:7
 }
 {   .mmb  //R-Addr: 0X020
	nop.m	 0
	nop.m	 0
	br.ret.sptk.many b0;;				    //14 	cc:7
 }
// End code for function:
	.endp	BSF#
	.type	BSR#	,@function
        .global BSR#
	.type	u8bit#	,@object
        .global u8bit#
// Function compile flags: /Ogtp
	.section	.text

// Begin code for function: BSR:
	.proc	BSR#
	.align 32
BSR:
// x$ = r32
// Output regs: None
 {   .mii  //R-Addr: 0X00
	addl	r31=@ltoff(u8bit#),gp			    //47.Rm	cc:0
	shr.u	r27=r32, 32;;				    //23.	cc:0
	cmp.ne.unc p7,p6=r0, r27			    //25.	cc:1
 }
 {   .mmi  //R-Addr: 0X010
	ld8	r25=[r31];;				    //47.Rm	cc:1
  (p6)	mov	r27=r32					    //26.	cc:2
  (p7)	extr.u	r29=r32, 32, 16				    //32.	cc:2
 }
 {   .mii  //R-Addr: 0X020
  (p7)	mov	r28=32					    //33.	cc:2
  (p6)	zxt2	r29=r32					    //27.	cc:2
  (p6)	mov	r28=r0;;				    //28.	cc:2
 }
 {   .mii  //R-Addr: 0X030
	cmp.ne.unc p9,p8=r29, r27			    //36.	cc:3
	shr.u	r30=r27, 16;;				    //35.	cc:3
  (p9)	zxt1	r22=r30					    //40.	cc:4
 }
 {   .mii  //R-Addr: 0X040
  (p9)	mov	r27=r30					    //39.	cc:4
  (p8)	zxt1	r22=r27					    //37.	cc:4
  (p9)	adds	r28=16, r28;;				    //41.	cc:4
 }
 {   .mii  //R-Addr: 0X050
	add	r26=r27, r25				    //45.R	cc:5
	shr.u	r21=r27, 8				    //43.R	cc:5
	cmp.ne.unc p10,p0=r27, r22;;			    //44.	cc:5
 }
 {   .mib  //R-Addr: 0X060
  (p10)	add	r26=r21, r25				    //47.	cc:6
  (p10)	adds	r28=8, r28				    //48.	cc:6
	nop.b	 0;;
 }
 {   .mmi  //R-Addr: 0X070
	ld1	r31=[r26];;				    //50.	cc:7
	add	r8=r31, r28				    //50.	cc:8
	nop.i	 0
 }
 {   .mmb  //R-Addr: 0X080
	nop.m	 0
	nop.m	 0
	br.ret.sptk.many b0;;				    //51 	cc:8
 }
// End code for function:
	.endp	BSR#

// Total code size for all functions: 0X0c0 bytes (12 bundles)

// END

--------------------------------------------------------------------------

:-)

Thanks,
Eugene

>But i have to wait some time, until i can try it ;-(
>
>Cheers,
>Gerd



This page took 0 seconds to execute

Last modified: Thu, 15 Apr 21 08:11:13 -0700

Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.