Author: Gerd Isenberg
Date: 13:12:28 08/09/03
Go up one level in this thread
On August 08, 2003 at 15:53:00, Eugene Nalimov wrote:
>On August 07, 2003 at 15:40:33, Gerd Isenberg wrote:
>
>>...
>>3. bsf, still vector path and 9 cycles.
>
>Hmm, on Itanium2 I can do BSF/BSR equivalent in 8/9 clocks without BSF/BSR
>instructions:
Hi Eugene,
I see, Itanium2 has popcount, thanks for the lession.
I refered to opteron's bsf, but anyway, nice lession how to use intrinsics.
My bitscan collection grows and grows ;-)
I'm interested in performance of opterons bsf/btr instructions versus
Matt Taylor's approach with 64-bit magic de Bruijn multiplication, may be even
without lookup:
typedef unsigned long long BitBoard;
int bitScanAndReset(BitBoard &bb)
{
BitBoard lsb = bb ^ (bb - 1);
bb &= ~lsb;
return tbl64[(lsb * 0x03f08c5392cd5dbd) >> (64 - 6)];
}
I guess a 32-bit folded version is inferiour on opeteron, because there is a
real shift >> 32 with 64 bit registers.
int bitScanAndReset(BitBoard &bb)
{
BitBoard lsb = bb ^ (bb - 1);
bb &= ~lsb;
unsigned __int32 foldedLSB = ((int) lsb) ^ ((int)(lsb>>32));
return lsz64_tbl[(foldedLSB * 0x78291ACF) >> (32-6)];
}
Thanks,
Gerd
>
>--------------------------------------------------------------------------
>
>union _m64 {
> unsigned long long u64;
> // Other stuff deleted as irrelevant
>};
>
>extern union _m64 __m64_popcnt(unsigned long long);
>#pragma intrinsic(__m64_popcnt)
>
>unsigned long long BSF (unsigned long long x)
>{
> union _m64 y;
>
> y = __m64_popcnt(x^(x-1));
> return y.u64-1;
>}
>
>extern unsigned char u8bit[256];
>unsigned long long BSR (unsigned long long x)
>{
> unsigned long long r, s, t, u, w, y, z;
> unsigned long long result;
> unsigned char *p;
>
> y = x >> 32;
> p = u8bit;
> if (y == 0) {
> t = x;
> u = (unsigned short) x;
> result = 0;
> }
> else {
> t = y;
> u = (unsigned short) y;
> result = 32;
> }
> w = t >> 16;
> if (u == t)
> r = (unsigned char) t;
> else {
> t = w;
> r = (unsigned char) w;
> result += 16;
> }
> s = t >> 8;
> if (t == r)
> p += t;
> else {
> p += s;
> result += 8;
> }
> return result + *p;
>}
>
>--------------------------------------------------------------------------
>
>// Listing generated by Microsoft (R) Optimizing Compiler Version 14.00.30807
>
> .file "C:/repro/bb.c"
> .radix D
> .section .text, "ax", "progbits"
> .align 32
> .section .pdata, "a", "progbits"
> .align 4
> .section .xdata, "a", "progbits"
> .align 8
> .section .data, "wa", "progbits"
> .align 16
> .section .rdata, "a", "progbits"
> .align 16
> .section .bss, "wa", "nobits"
> .align 16
> .section .debug$S, "ax", "progbits"
> .align 16
> .section .tls$, "was", "progbits"
> .align 16
> .section .sdata, "was", "progbits"
> .align 16
> .section .sbss, "was", "nobits"
> .align 16
> .section .srdata, "as", "progbits"
> .align 16
> .section .rdata, "a", "progbits"
> .align 16
> .type BSF# ,@function
> .global BSF#
>// Function compile flags: /Ogtp
> .section .text
>
>// Begin code for function: BSF:
> .proc BSF#
> .align 32
>BSF:
>// x$ = r32
>// Output regs: None
>// File c:\repro\bb.c
> { .mmi //R-Addr: 0X00
> adds r31=-1, r32;; //12. cc:0
> xor r30=r31, r32 //12. cc:1
> nop.i 0;;
> }
> { .mii //R-Addr: 0X010
> nop.m 0
> popcnt r29=r30;; //12. cc:4
> adds r8=-1, r29 //13. cc:7
> }
> { .mmb //R-Addr: 0X020
> nop.m 0
> nop.m 0
> br.ret.sptk.many b0;; //14 cc:7
> }
>// End code for function:
> .endp BSF#
> .type BSR# ,@function
> .global BSR#
> .type u8bit# ,@object
> .global u8bit#
>// Function compile flags: /Ogtp
> .section .text
>
>// Begin code for function: BSR:
> .proc BSR#
> .align 32
>BSR:
>// x$ = r32
>// Output regs: None
> { .mii //R-Addr: 0X00
> addl r31=@ltoff(u8bit#),gp //47.Rm cc:0
> shr.u r27=r32, 32;; //23. cc:0
> cmp.ne.unc p7,p6=r0, r27 //25. cc:1
> }
> { .mmi //R-Addr: 0X010
> ld8 r25=[r31];; //47.Rm cc:1
> (p6) mov r27=r32 //26. cc:2
> (p7) extr.u r29=r32, 32, 16 //32. cc:2
> }
> { .mii //R-Addr: 0X020
> (p7) mov r28=32 //33. cc:2
> (p6) zxt2 r29=r32 //27. cc:2
> (p6) mov r28=r0;; //28. cc:2
> }
> { .mii //R-Addr: 0X030
> cmp.ne.unc p9,p8=r29, r27 //36. cc:3
> shr.u r30=r27, 16;; //35. cc:3
> (p9) zxt1 r22=r30 //40. cc:4
> }
> { .mii //R-Addr: 0X040
> (p9) mov r27=r30 //39. cc:4
> (p8) zxt1 r22=r27 //37. cc:4
> (p9) adds r28=16, r28;; //41. cc:4
> }
> { .mii //R-Addr: 0X050
> add r26=r27, r25 //45.R cc:5
> shr.u r21=r27, 8 //43.R cc:5
> cmp.ne.unc p10,p0=r27, r22;; //44. cc:5
> }
> { .mib //R-Addr: 0X060
> (p10) add r26=r21, r25 //47. cc:6
> (p10) adds r28=8, r28 //48. cc:6
> nop.b 0;;
> }
> { .mmi //R-Addr: 0X070
> ld1 r31=[r26];; //50. cc:7
> add r8=r31, r28 //50. cc:8
> nop.i 0
> }
> { .mmb //R-Addr: 0X080
> nop.m 0
> nop.m 0
> br.ret.sptk.many b0;; //51 cc:8
> }
>// End code for function:
> .endp BSR#
>
>// Total code size for all functions: 0X0c0 bytes (12 bundles)
>
>// END
>
>--------------------------------------------------------------------------
>
>:-)
>
>Thanks,
>Eugene
>
>>But i have to wait some time, until i can try it ;-(
>>
>>Cheers,
>>Gerd
This page took 0.01 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.