Author: Matt Taylor
Date: 23:47:25 01/21/03
Go up one level in this thread
On January 21, 2003 at 18:42:47, David Rasmussen wrote:
>Thanks a lot for the answers!
>
>It is disappointing to hear that it cannot be done on MSVC/Intel. How about gcc?
>Is it possible to make such fast assembler inline functions for gcc?
>Will anybody do it?
>
>/David
It is (sort've) possible on gcc. MSVC/Intel have the convenient asm-block
syntax. Since there is no way to tell the compiler where you want things and
where you put your outputs, it has to be passed through memory. Worse yet, it
has the added side-effect that the compiler has to stash unwritten temp values
to memory (in case you trash the register they occupy).
GCC is much more annoying to deal with syntactically (since you have to format
the assembly pedantically for gas), but it allows for better optimization
opportunities. You make a contract with gcc. You tell gcc which locations are
acceptable (memory, register, etc.) for inputs and outputs. The compiler will
then arrange for the value to be in an appropriate location for you.
It is trivial to convert MSVC/Intel -> GCC.
I'll post what I've done thus far below so that it can be properly criticized,
and you can incorporate it also. I've made notes about the speed of the
functions relative to their C equivalents as well as other suggestions. It is
worth reading the comments.
Note that all cycle counts are estimates. Athlon has some weird issued with
back-to-back cmov/set instructions, and I have not actually -measured- the speed
of any of these routines.
// Better than call to shift function
BitBoard Mask(Square square)
{
BitBoard bb;
_asm
{
; 4 cycles + memory penalties
mov ecx, square
mov edx, 1
shl edx, cl
test ecx, 32
mov ecx, 0
cmovz eax, edx
cmovz edx, ecx
mov DWORD PTR [bb], eax
mov DWORD PTR [bb+4], edx
}
return bb;
}
// Push shift back to here to avoid unnecessary manipulation of constants
// Alternatively, if you are able to, change the values of the rank constants.
// You may be playing games elsewhere with the rank, but if you extract rank
// from a square, you can mask instead of shifting.
#define RankMask(r) RankMask1((r) << 3)
// Roughly equal in speed to table-based, less memory utilization
BitBoard RankMask1(Rank rank)
{
BitBoard bb;
_asm
{
; 4 cycles + memory penalties
mov ecx, rank
mov eax, 0xFF
shl eax, cl
test ecx, 32
mov ecx, 0
cmovnz edx, eax
cmovnz eax, ecx
mov DWORD PTR [bb], eax
mov DWORD PTR [bb+4], edx
}
return bb;
}
// Roughly equal in speed to table-based, less memory utilization
BitBoard FileMask(File file)
{
BitBoard bb;
_asm
{
mov eax, 0x01010101
mov ecx, file
shl eax, cl
mov DWORD PTR [bb], eax
mov DWORD PTR [bb+4], eax
}
return bb;
}
// Roughly equal in speed to table-based, less memory utilization
void Set(BitBoard &bb, Square square)
{
_asm
{
mov ecx, square
mov edx, 1
mov eax, bb
shl edx, cl
shr ecx, 5
or [eax+ecx*4], edx
}
}
// Roughly equal in speed to table-based, less memory utilization
void Clear(BitBoard &bb, Square square)
{
_asm
{
mov ecx, square
mov edx, -2
mov eax, bb
rol edx, cl
shr ecx, 5
and [eax+ecx*4], edx
}
}
// Roughly equal in speed to table-based, less memory utilization
void Toggle(BitBoard &bb, Square square)
{
_asm
{
mov ecx, square
mov edx, 1
mov eax, bb
shl edx, cl
shl ecx, 5
xor [eax+ecx*4], edx
}
}
// define __FAST_BSF for Pentium 2/3
// define __NO_TABLES to compare performance against the table-based version
#if !defined(__FAST_BSF) && !defined(__NO_TABLES)
#define LSB_32_SIZE 134
// Fast LSB algorithm courtesy of Walter Faxon!
static const uint8 LSB_32_table[LSB_32_SIZE * 2] =
{
#define __ 0
23,16,17,__,18,10, 8, 9,19,11, 31,__,__,__,__,__,20,12,__,__,
__,__,__,__,__,__,__,__,__,__, __,__,21,13,__,__,__,__,__,__,
__,__,__,__,__,__,__,__,__,__, __,__,__,__,__,__,__,__,__,__,
__,__,__,__,22,14,__,__,__,__, 6,__,__,__,30,__,__,__,__,__,
__,__,__,__,__,__,__,__,__,__, __,__,__,__,__,__,__,__,__,__,
__,__, 5,__,__,__,29,__,__,__, 3,__,__,__, 2,__, 1, 0, 4,__,
__,__,28,__,__,__,26,24,25,15, 27,__,__, 7,
55,48,49,__,50,42,40,41,51,43, 63,__,__,__,__,__,52,44,__,__,
__,__,__,__,__,__,__,__,__,__, __,__,53,45,__,__,__,__,__,__,
__,__,__,__,__,__,__,__,__,__, __,__,__,__,__,__,__,__,__,__,
__,__,__,__,54,46,__,__,__,__, 38,__,__,__,62,__,__,__,__,__,
__,__,__,__,__,__,__,__,__,__, __,__,__,__,__,__,__,__,__,__,
__,__,37,__,__,__,61,__,__,__, 35,__,__,__,34,__,33,32,36,__,
__,__,60,__,__,__,58,56,57,47, 59,__,__,39
#undef __
};
#endif /* !defined(__FAST_BSF) && !defined(__NO_TABLES) */
INLINE int FirstBit(const BitBoard bb)
{
int nBit;
__asm
{
#if defined(__FAST_BSF) || defined(__NO_TABLES)
mov ecx, DWORD PTR [bb]
mov edx, DWORD PTR [bb+4]
xor eax, eax
cmp ecx, 1
cmovc ecx, edx
adc eax, eax
mov edx, 1
bsf ecx, ecx
shl edx, cl
shl eax, 5
or eax, ecx
mov [nBit], eax
#else
mov edx, DWORD PTR [bb]
mov ecx, DWORD PTR [bb+4]
xor eax, eax
cmp edx, 1
cmovc edx, ecx
adc eax, eax
mov ecx, edx
neg edx
shl eax, 5
and edx, ecx
mov [nBit], eax
dec edx
xor edx, 0x05407DB7
mov ecx, edx
shr ecx, 16
add edx, ecx
sub dl, dh
movzx ecx, dl
xor edx, edx
mov cl, BYTE PTR [LSB_32_table+ecx]
mov [nBit], cl
#endif /* defined(__FAST_BSF) || defined(__NO_TABLES) */
}
return nBit;
}
// No table-based version for this
INLINE int LastBit(const BitBoard bb)
{
int nBit;
__asm
{
mov edx, DWORD PTR [bb]
mov ecx, DWORD PTR [bb+4]
cmp ecx, 1
cmovc ecx, edx
sbb eax, eax
bsr ecx, ecx
inc eax
shl eax, 5
or eax, ecx
mov [nBit], eax
}
return nBit;
}
// Faster than MMX at the expense of extra registers (I think)
// COULD be faster yet!
INLINE int PopCount(BitBoard a)
{
int nCount;
__asm
{
mov eax, DWORD PTR [a]
mov ecx, DWORD PTR [a+4]
mov edx, eax
shr eax, 1
mov ebx, ecx
and eax, 0x55555555
shr ecx, 1
and ecx, 0x55555555
sub edx, eax
mov eax, edx
shr edx, 2
sub ebx, ecx
mov ecx, ebx
and eax, 0x33333333
and edx, 0x33333333
shr ebx, 2
add eax, edx
mov edx, eax
shr eax, 4
and ecx, 0x33333333
and ebx, 0x33333333
add eax, edx
and eax, 0x0F0F0F0F
add ecx, ebx
mov ebx, ecx
shr ecx, 4
add ecx, ebx
and ecx, 0x0F0F0F0F
add eax, ecx
imul eax, 0x01010101
shr eax, 24
mov [nCount], eax
}
return nCount;
}
-Matt
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.