Author: Matt Taylor
Date: 23:47:25 01/21/03
Go up one level in this thread
On January 21, 2003 at 18:42:47, David Rasmussen wrote: >Thanks a lot for the answers! > >It is disappointing to hear that it cannot be done on MSVC/Intel. How about gcc? >Is it possible to make such fast assembler inline functions for gcc? >Will anybody do it? > >/David It is (sort've) possible on gcc. MSVC/Intel have the convenient asm-block syntax. Since there is no way to tell the compiler where you want things and where you put your outputs, it has to be passed through memory. Worse yet, it has the added side-effect that the compiler has to stash unwritten temp values to memory (in case you trash the register they occupy). GCC is much more annoying to deal with syntactically (since you have to format the assembly pedantically for gas), but it allows for better optimization opportunities. You make a contract with gcc. You tell gcc which locations are acceptable (memory, register, etc.) for inputs and outputs. The compiler will then arrange for the value to be in an appropriate location for you. It is trivial to convert MSVC/Intel -> GCC. I'll post what I've done thus far below so that it can be properly criticized, and you can incorporate it also. I've made notes about the speed of the functions relative to their C equivalents as well as other suggestions. It is worth reading the comments. Note that all cycle counts are estimates. Athlon has some weird issued with back-to-back cmov/set instructions, and I have not actually -measured- the speed of any of these routines. // Better than call to shift function BitBoard Mask(Square square) { BitBoard bb; _asm { ; 4 cycles + memory penalties mov ecx, square mov edx, 1 shl edx, cl test ecx, 32 mov ecx, 0 cmovz eax, edx cmovz edx, ecx mov DWORD PTR [bb], eax mov DWORD PTR [bb+4], edx } return bb; } // Push shift back to here to avoid unnecessary manipulation of constants // Alternatively, if you are able to, change the values of the rank constants. // You may be playing games elsewhere with the rank, but if you extract rank // from a square, you can mask instead of shifting. #define RankMask(r) RankMask1((r) << 3) // Roughly equal in speed to table-based, less memory utilization BitBoard RankMask1(Rank rank) { BitBoard bb; _asm { ; 4 cycles + memory penalties mov ecx, rank mov eax, 0xFF shl eax, cl test ecx, 32 mov ecx, 0 cmovnz edx, eax cmovnz eax, ecx mov DWORD PTR [bb], eax mov DWORD PTR [bb+4], edx } return bb; } // Roughly equal in speed to table-based, less memory utilization BitBoard FileMask(File file) { BitBoard bb; _asm { mov eax, 0x01010101 mov ecx, file shl eax, cl mov DWORD PTR [bb], eax mov DWORD PTR [bb+4], eax } return bb; } // Roughly equal in speed to table-based, less memory utilization void Set(BitBoard &bb, Square square) { _asm { mov ecx, square mov edx, 1 mov eax, bb shl edx, cl shr ecx, 5 or [eax+ecx*4], edx } } // Roughly equal in speed to table-based, less memory utilization void Clear(BitBoard &bb, Square square) { _asm { mov ecx, square mov edx, -2 mov eax, bb rol edx, cl shr ecx, 5 and [eax+ecx*4], edx } } // Roughly equal in speed to table-based, less memory utilization void Toggle(BitBoard &bb, Square square) { _asm { mov ecx, square mov edx, 1 mov eax, bb shl edx, cl shl ecx, 5 xor [eax+ecx*4], edx } } // define __FAST_BSF for Pentium 2/3 // define __NO_TABLES to compare performance against the table-based version #if !defined(__FAST_BSF) && !defined(__NO_TABLES) #define LSB_32_SIZE 134 // Fast LSB algorithm courtesy of Walter Faxon! static const uint8 LSB_32_table[LSB_32_SIZE * 2] = { #define __ 0 23,16,17,__,18,10, 8, 9,19,11, 31,__,__,__,__,__,20,12,__,__, __,__,__,__,__,__,__,__,__,__, __,__,21,13,__,__,__,__,__,__, __,__,__,__,__,__,__,__,__,__, __,__,__,__,__,__,__,__,__,__, __,__,__,__,22,14,__,__,__,__, 6,__,__,__,30,__,__,__,__,__, __,__,__,__,__,__,__,__,__,__, __,__,__,__,__,__,__,__,__,__, __,__, 5,__,__,__,29,__,__,__, 3,__,__,__, 2,__, 1, 0, 4,__, __,__,28,__,__,__,26,24,25,15, 27,__,__, 7, 55,48,49,__,50,42,40,41,51,43, 63,__,__,__,__,__,52,44,__,__, __,__,__,__,__,__,__,__,__,__, __,__,53,45,__,__,__,__,__,__, __,__,__,__,__,__,__,__,__,__, __,__,__,__,__,__,__,__,__,__, __,__,__,__,54,46,__,__,__,__, 38,__,__,__,62,__,__,__,__,__, __,__,__,__,__,__,__,__,__,__, __,__,__,__,__,__,__,__,__,__, __,__,37,__,__,__,61,__,__,__, 35,__,__,__,34,__,33,32,36,__, __,__,60,__,__,__,58,56,57,47, 59,__,__,39 #undef __ }; #endif /* !defined(__FAST_BSF) && !defined(__NO_TABLES) */ INLINE int FirstBit(const BitBoard bb) { int nBit; __asm { #if defined(__FAST_BSF) || defined(__NO_TABLES) mov ecx, DWORD PTR [bb] mov edx, DWORD PTR [bb+4] xor eax, eax cmp ecx, 1 cmovc ecx, edx adc eax, eax mov edx, 1 bsf ecx, ecx shl edx, cl shl eax, 5 or eax, ecx mov [nBit], eax #else mov edx, DWORD PTR [bb] mov ecx, DWORD PTR [bb+4] xor eax, eax cmp edx, 1 cmovc edx, ecx adc eax, eax mov ecx, edx neg edx shl eax, 5 and edx, ecx mov [nBit], eax dec edx xor edx, 0x05407DB7 mov ecx, edx shr ecx, 16 add edx, ecx sub dl, dh movzx ecx, dl xor edx, edx mov cl, BYTE PTR [LSB_32_table+ecx] mov [nBit], cl #endif /* defined(__FAST_BSF) || defined(__NO_TABLES) */ } return nBit; } // No table-based version for this INLINE int LastBit(const BitBoard bb) { int nBit; __asm { mov edx, DWORD PTR [bb] mov ecx, DWORD PTR [bb+4] cmp ecx, 1 cmovc ecx, edx sbb eax, eax bsr ecx, ecx inc eax shl eax, 5 or eax, ecx mov [nBit], eax } return nBit; } // Faster than MMX at the expense of extra registers (I think) // COULD be faster yet! INLINE int PopCount(BitBoard a) { int nCount; __asm { mov eax, DWORD PTR [a] mov ecx, DWORD PTR [a+4] mov edx, eax shr eax, 1 mov ebx, ecx and eax, 0x55555555 shr ecx, 1 and ecx, 0x55555555 sub edx, eax mov eax, edx shr edx, 2 sub ebx, ecx mov ecx, ebx and eax, 0x33333333 and edx, 0x33333333 shr ebx, 2 add eax, edx mov edx, eax shr eax, 4 and ecx, 0x33333333 and ebx, 0x33333333 add eax, edx and eax, 0x0F0F0F0F add ecx, ebx mov ebx, ecx shr ecx, 4 add ecx, ebx and ecx, 0x0F0F0F0F add eax, ecx imul eax, 0x01010101 shr eax, 24 mov [nCount], eax } return nCount; } -Matt
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.