Author: Eugene Nalimov
Date: 12:53:00 08/08/03
Go up one level in this thread
On August 07, 2003 at 15:40:33, Gerd Isenberg wrote:

>...
>3. bsf, still vector path and 9 cycles.

Hmm, on Itanium2 I can do BSF/BSR equivalent in 8/9 clocks without BSF/BSR instructions:

--------------------------------------------------------------------------
union _m64 {
    unsigned long long u64;
    // Other stuff deleted as irrelevant
};

extern union _m64 __m64_popcnt(unsigned long long);
#pragma intrinsic(__m64_popcnt)

unsigned long long BSF (unsigned long long x) {
    union _m64 y;

    y = __m64_popcnt(x^(x-1));
    return y.u64-1;
}

extern unsigned char u8bit[256];

unsigned long long BSR (unsigned long long x) {
    unsigned long long r, s, t, u, w, y, z;
    unsigned long long result;
    unsigned char *p;

    y = x >> 32;
    p = u8bit;
    if (y == 0) {
        t = x;
        u = (unsigned short) x;
        result = 0;
    }
    else {
        t = y;
        u = (unsigned short) y;
        result = 32;
    }
    w = t >> 16;
    if (u == t)
        r = (unsigned char) t;
    else {
        t = w;
        r = (unsigned char) w;
        result += 16;
    }
    s = t >> 8;
    if (t == r)
        p += t;
    else {
        p += s;
        result += 8;
    }
    return result + *p;
}
--------------------------------------------------------------------------
// Listing generated by Microsoft (R) Optimizing Compiler Version 14.00.30807
    .file "C:/repro/bb.c"
    .radix D
    .section .text, "ax", "progbits"
    .align 32
    .section .pdata, "a", "progbits"
    .align 4
    .section .xdata, "a", "progbits"
    .align 8
    .section .data, "wa", "progbits"
    .align 16
    .section .rdata, "a", "progbits"
    .align 16
    .section .bss, "wa", "nobits"
    .align 16
    .section .debug$S, "ax", "progbits"
    .align 16
    .section .tls$, "was", "progbits"
    .align 16
    .section .sdata, "was", "progbits"
    .align 16
    .section .sbss, "was", "nobits"
    .align 16
    .section .srdata, "as", "progbits"
    .align 16
    .section .rdata, "a", "progbits"
    .align 16
    .type BSF# ,@function
    .global BSF#
// Function compile flags: /Ogtp
    .section .text
// Begin code for function: BSF:
    .proc BSF#
    .align 32
BSF:
// x$ = r32
// Output regs: None
// File c:\repro\bb.c
{ .mmi //R-Addr: 0X00
    adds r31=-1, r32;; //12. cc:0
    xor r30=r31, r32 //12. cc:1
    nop.i 0;;
}
{ .mii //R-Addr: 0X010
    nop.m 0
    popcnt r29=r30;; //12. cc:4
    adds r8=-1, r29 //13. cc:7
}
{ .mmb //R-Addr: 0X020
    nop.m 0
    nop.m 0
    br.ret.sptk.many b0;; //14 cc:7
}
// End code for function:
    .endp BSF#
    .type BSR# ,@function
    .global BSR#
    .type u8bit# ,@object
    .global u8bit#
// Function compile flags: /Ogtp
    .section .text
// Begin code for function: BSR:
    .proc BSR#
    .align 32
BSR:
// x$ = r32
// Output regs: None
{ .mii //R-Addr: 0X00
    addl r31=@ltoff(u8bit#),gp //47.Rm cc:0
    shr.u r27=r32, 32;; //23. cc:0
    cmp.ne.unc p7,p6=r0, r27 //25. cc:1
}
{ .mmi //R-Addr: 0X010
    ld8 r25=[r31];; //47.Rm cc:1
    (p6) mov r27=r32 //26. cc:2
    (p7) extr.u r29=r32, 32, 16 //32. cc:2
}
{ .mii //R-Addr: 0X020
    (p7) mov r28=32 //33. cc:2
    (p6) zxt2 r29=r32 //27. cc:2
    (p6) mov r28=r0;; //28. cc:2
}
{ .mii //R-Addr: 0X030
    cmp.ne.unc p9,p8=r29, r27 //36. cc:3
    shr.u r30=r27, 16;; //35. cc:3
    (p9) zxt1 r22=r30 //40. cc:4
}
{ .mii //R-Addr: 0X040
    (p9) mov r27=r30 //39. cc:4
    (p8) zxt1 r22=r27 //37. cc:4
    (p9) adds r28=16, r28;; //41. cc:4
}
{ .mii //R-Addr: 0X050
    add r26=r27, r25 //45.R cc:5
    shr.u r21=r27, 8 //43.R cc:5
    cmp.ne.unc p10,p0=r27, r22;; //44. cc:5
}
{ .mib //R-Addr: 0X060
    (p10) add r26=r21, r25 //47. cc:6
    (p10) adds r28=8, r28 //48. cc:6
    nop.b 0;;
}
{ .mmi //R-Addr: 0X070
    ld1 r31=[r26];; //50. cc:7
    add r8=r31, r28 //50. cc:8
    nop.i 0
}
{ .mmb //R-Addr: 0X080
    nop.m 0
    nop.m 0
    br.ret.sptk.many b0;; //51 cc:8
}
// End code for function:
    .endp BSR#
// Total code size for all functions: 0X0c0 bytes (12 bundles)
// END
--------------------------------------------------------------------------

:-)

Thanks,
Eugene

>But i have to wait some time, until i can try it ;-(
>
>Cheers,
>Gerd
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.