Author: Eugene Nalimov
Date: 12:53:00 08/08/03
Go up one level in this thread
On August 07, 2003 at 15:40:33, Gerd Isenberg wrote:
>...
>3. bsf, still vector path and 9 cycles.
Hmm, on Itanium 2 I can do the equivalent of BSF/BSR in 8/9 clocks without
dedicated BSF/BSR instructions:
--------------------------------------------------------------------------
/* Value type returned by the popcnt intrinsic; only the 64-bit view is kept. */
union _m64 {
    unsigned long long u64;
    // Other stuff deleted as irrelevant
};

/* Compiler intrinsic, declared here and enabled by the pragma below. */
extern union _m64 __m64_popcnt(unsigned long long);
#pragma intrinsic(__m64_popcnt)

/*
 * Bit-scan-forward built from a population count instead of a scan
 * instruction.
 *
 * x ^ (x - 1) has the lowest set bit of x and every bit below it set, so
 * (assuming __m64_popcnt returns the number of set bits of its argument)
 * the count minus one is the index of the lowest set bit of x.
 *
 * NOTE(review): for x == 0 the mask is all ones, so under the same
 * assumption the result would be 63 -- callers presumably never pass 0;
 * confirm.
 */
unsigned long long BSF (unsigned long long x)
{
    const unsigned long long low_mask = x ^ (x - 1);
    const union _m64 bit_count = __m64_popcnt(low_mask);

    return bit_count.u64 - 1;
}
/* 256-entry per-byte lookup table, defined elsewhere. */
extern unsigned char u8bit[256];

/*
 * Bit-scan-reverse done as a branch-light binary search followed by a
 * table lookup.
 *
 * The value is narrowed 32 -> 16 -> 8 bits, each time keeping the upper
 * half if it is non-zero and adding that half's width to the running bit
 * offset.  The surviving byte indexes u8bit and the table entry is added
 * to the offset.
 *
 * NOTE(review): presumably u8bit[b] is the index of the highest set bit
 * of byte b, which makes the result the index of the highest set bit of
 * x -- confirm against the table's definition.  For x == 0 the function
 * returns u8bit[0].
 */
unsigned long long BSR (unsigned long long x)
{
    const unsigned long long upper_half = x >> 32;
    unsigned long long field;   /* part of x still being searched */
    unsigned long long offset;  /* bit position of field's bit 0 within x */

    /* 64 -> 32 bits: search the upper word if it is non-zero. */
    if (upper_half != 0) {
        field = upper_half;
        offset = 32;
    }
    else {
        field = x;
        offset = 0;
    }

    /* 32 -> 16 bits: anything above bit 15 means the top half wins. */
    if ((unsigned short) field != field) {
        field >>= 16;
        offset += 16;
    }

    /* 16 -> 8 bits: same test at byte granularity. */
    if ((unsigned char) field != field) {
        field >>= 8;
        offset += 8;
    }

    /* field now fits in one byte and selects the table entry. */
    return offset + u8bit[field];
}
--------------------------------------------------------------------------
// Listing generated by Microsoft (R) Optimizing Compiler Version 14.00.30807
// Compiler-emitted preamble: records the source file name, selects decimal
// radix, and declares each output section with its flags and alignment
// before any code is emitted.  (.rdata is declared twice with identical
// attributes.)
.file "C:/repro/bb.c"
.radix D
.section .text, "ax", "progbits"
.align 32
.section .pdata, "a", "progbits"
.align 4
.section .xdata, "a", "progbits"
.align 8
.section .data, "wa", "progbits"
.align 16
.section .rdata, "a", "progbits"
.align 16
.section .bss, "wa", "nobits"
.align 16
.section .debug$S, "ax", "progbits"
.align 16
.section .tls$, "was", "progbits"
.align 16
.section .sdata, "was", "progbits"
.align 16
.section .sbss, "was", "nobits"
.align 16
.section .srdata, "as", "progbits"
.align 16
.section .rdata, "a", "progbits"
.align 16
.type BSF# ,@function
.global BSF#
// Function compile flags: /Ogtp
.section .text
// Begin code for function: BSF:
.proc BSF#
.align 32
BSF:
// x$ = r32
// Output regs: None
// File c:\repro\bb.c
//
// Generated code for the C function BSF: r8 = popcnt(x ^ (x - 1)) - 1.
// In:  r32 = x
// Out: r8  = result
// Scratch: r29, r30, r31
// The trailing "//<n>. cc:<m>" notes are the compiler's own annotations;
// presumably <n> is a source line and <m> the scheduled cycle -- confirm.
{ .mmi //R-Addr: 0X00
// r31 = x - 1
adds r31=-1, r32;; //12. cc:0
// r30 = (x - 1) ^ x
xor r30=r31, r32 //12. cc:1
nop.i 0;;
}
{ .mii //R-Addr: 0X010
nop.m 0
// r29 = population count of r30
popcnt r29=r30;; //12. cc:4
// r8 = r29 - 1, the value the C source returns
adds r8=-1, r29 //13. cc:7
}
{ .mmb //R-Addr: 0X020
nop.m 0
nop.m 0
// return through branch register b0
br.ret.sptk.many b0;; //14 cc:7
}
// End code for function:
.endp BSF#
.type BSR# ,@function
.global BSR#
.type u8bit# ,@object
.global u8bit#
// Function compile flags: /Ogtp
.section .text
// Begin code for function: BSR:
.proc BSR#
.align 32
BSR:
// x$ = r32
// Output regs: None
//
// Generated code for the C function BSR.  No branches other than the
// return: each if/else of the C source becomes a compare that sets a
// complementary predicate pair, with both arms issued under predicate.
// In:  r32 = x
// Out: r8  = result + *p
// Register roles (names refer to the C locals):
//   r25 = address of u8bit      r26 = p
//   r27 = y, then t             r28 = result
//   r29 = u                     r30 = w
//   r22 = r                     r21 = s
//   r31 = linkage-table slot address, later reused for the loaded byte
// The trailing "//<n>. cc:<m>" notes are the compiler's own annotations;
// presumably <n> is a source line and <m> the scheduled cycle -- confirm.
{ .mii //R-Addr: 0X00
// r31 = gp-relative address of the linkage-table entry for u8bit
addl r31=@ltoff(u8bit#),gp //47.Rm cc:0
// y = x >> 32
shr.u r27=r32, 32;; //23. cc:0
// p7 = (y != 0), p6 = (y == 0)
cmp.ne.unc p7,p6=r0, r27 //25. cc:1
}
{ .mmi //R-Addr: 0X010
// r25 = address of u8bit, read from the linkage table
ld8 r25=[r31];; //47.Rm cc:1
// y == 0: t = x   (when y != 0, r27 already holds t = y)
(p6) mov r27=r32 //26. cc:2
// y != 0: u = bits 32..47 of x, i.e. (unsigned short) y
(p7) extr.u r29=r32, 32, 16 //32. cc:2
}
{ .mii //R-Addr: 0X020
// y != 0: result = 32
(p7) mov r28=32 //33. cc:2
// y == 0: u = (unsigned short) x
(p6) zxt2 r29=r32 //27. cc:2
// y == 0: result = 0
(p6) mov r28=r0;; //28. cc:2
}
{ .mii //R-Addr: 0X030
// p9 = (u != t), p8 = (u == t)
cmp.ne.unc p9,p8=r29, r27 //36. cc:3
// w = t >> 16
shr.u r30=r27, 16;; //35. cc:3
// u != t: r = (unsigned char) w
(p9) zxt1 r22=r30 //40. cc:4
}
{ .mii //R-Addr: 0X040
// u != t: t = w.  The write to r27 here and the read of r27 on the next
// line are under the complementary predicates p9/p8 set by the compare
// above, so only one of the two takes effect.
(p9) mov r27=r30 //39. cc:4
// u == t: r = (unsigned char) t
(p8) zxt1 r22=r27 //37. cc:4
// u != t: result += 16
(p9) adds r28=16, r28;; //41. cc:4
}
{ .mii //R-Addr: 0X050
// p = u8bit + t, computed unconditionally; overwritten below if t != r
add r26=r27, r25 //45.R cc:5
// s = t >> 8
shr.u r21=r27, 8 //43.R cc:5
// p10 = (t != r); the complementary result is discarded into p0
cmp.ne.unc p10,p0=r27, r22;; //44. cc:5
}
{ .mib //R-Addr: 0X060
// t != r: p = u8bit + s
(p10) add r26=r21, r25 //47. cc:6
// t != r: result += 8
(p10) adds r28=8, r28 //48. cc:6
nop.b 0;;
}
{ .mmi //R-Addr: 0X070
// r31 = *p (one byte from the table)
ld1 r31=[r26];; //50. cc:7
// r8 = *p + result, the value the C source returns
add r8=r31, r28 //50. cc:8
nop.i 0
}
{ .mmb //R-Addr: 0X080
nop.m 0
nop.m 0
// return through branch register b0
br.ret.sptk.many b0;; //51 cc:8
}
// End code for function:
.endp BSR#
// Total code size for all functions: 0X0c0 bytes (12 bundles)
// END
--------------------------------------------------------------------------
:-)
Thanks,
Eugene
>But i have to wait some time, until i can try it ;-(
>
>Cheers,
>Gerd
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.