Computer Chess Club Archives

Search

Terms
Messages

Subject: Re: Assembly Programmers Challenge! (repost and clarification)

Author: Matt Taylor
Date: 23:47:25 01/21/03
On January 21, 2003 at 18:42:47, David Rasmussen wrote:

>Thanks a lot for the answers!
>
>It is disappointing to hear that it cannot be done on MSVC/Intel. How about gcc?
>Is it possible to make such fast assembler inline functions for gcc?
>Will anybody do it?
>
>/David

It is (sort've) possible on gcc. MSVC/Intel have the convenient asm-block
syntax. Since there is no way to tell the compiler where you want things and
where you put your outputs, it has to be passed through memory. Worse yet, it
has the added side-effect that the compiler has to stash unwritten temp values
to memory (in case you trash the register they occupy).

GCC is much more annoying to deal with syntactically (since you have to format
the assembly pedantically for gas), but it allows for better optimization
opportunities. You make a contract with gcc. You tell gcc which locations are
acceptable (memory, register, etc.) for inputs and outputs. The compiler will
then arrange for the value to be in an appropriate location for you.

It is trivial to convert MSVC/Intel -> GCC.

I'll post what I've done thus far below so that it can be properly criticized,
and you can incorporate it also. I've made notes about the speed of the
functions relative to their C equivalents as well as other suggestions. It is
worth reading the comments.

Note that all cycle counts are estimates. Athlon has some weird issued with
back-to-back cmov/set instructions, and I have not actually -measured- the speed
of any of these routines.

// Better than call to shift function
BitBoard Mask(Square square)
{
	BitBoard bb;

	_asm
	{
		; 4 cycles + memory penalties
		mov	ecx, square
		mov	edx, 1

		shl	edx, cl
		test	ecx, 32
		mov	ecx, 0

		cmovz	eax, edx
		cmovz	edx, ecx

		mov	DWORD PTR [bb], eax
		mov	DWORD PTR [bb+4], edx
	}

	return bb;
}

// Push shift back to here to avoid unnecessary manipulation of constants
// Alternatively, if you are able to, change the values of the rank constants.
// You may be playing games elsewhere with the rank, but if you extract rank
// from a square, you can mask instead of shifting.
#define RankMask(r) RankMask1((r) << 3)

// Roughly equal in speed to table-based, less memory utilization
BitBoard RankMask1(Rank rank)
{
	BitBoard bb;

	_asm
	{
		; 4 cycles + memory penalties
		mov	ecx, rank
		mov	eax, 0xFF

		shl	eax, cl
		test	ecx, 32
		mov	ecx, 0

		cmovnz	edx, eax
		cmovnz	eax, ecx

		mov	DWORD PTR [bb], eax
		mov	DWORD PTR [bb+4], edx
	}

	return bb;
}

// Roughly equal in speed to table-based, less memory utilization
BitBoard FileMask(File file)
{
	BitBoard bb;

	_asm
	{
		mov	eax, 0x01010101
		mov	ecx, file

		shl	eax, cl

		mov	DWORD PTR [bb], eax
		mov	DWORD PTR [bb+4], eax
	}

	return bb;
}

// Roughly equal in speed to table-based, less memory utilization
void Set(BitBoard &bb, Square square)
{
	_asm
	{
		mov	ecx, square
		mov	edx, 1
		mov	eax, bb

		shl	edx, cl
		shr	ecx, 5

		or	[eax+ecx*4], edx
	}
}

// Roughly equal in speed to table-based, less memory utilization
void Clear(BitBoard &bb, Square square)
{
	_asm
	{
		mov	ecx, square
		mov	edx, -2
		mov	eax, bb

		rol	edx, cl
		shr	ecx, 5

		and	[eax+ecx*4], edx
	}
}

// Roughly equal in speed to table-based, less memory utilization
void Toggle(BitBoard &bb, Square square)
{
	_asm
	{
		mov	ecx, square
		mov	edx, 1
		mov	eax, bb

		shl	edx, cl
		shl	ecx, 5

		xor	[eax+ecx*4], edx
	}
}

// define __FAST_BSF for Pentium 2/3
// define __NO_TABLES to compare performance against the table-based version
#if !defined(__FAST_BSF) && !defined(__NO_TABLES)
#define LSB_32_SIZE             134

// Fast LSB algorithm courtesy of Walter Faxon!
static const uint8 LSB_32_table[LSB_32_SIZE * 2] =
{
#define __  0
	23,16,17,__,18,10, 8, 9,19,11, 31,__,__,__,__,__,20,12,__,__,
	__,__,__,__,__,__,__,__,__,__, __,__,21,13,__,__,__,__,__,__,
	__,__,__,__,__,__,__,__,__,__, __,__,__,__,__,__,__,__,__,__,
	__,__,__,__,22,14,__,__,__,__,  6,__,__,__,30,__,__,__,__,__,
	__,__,__,__,__,__,__,__,__,__, __,__,__,__,__,__,__,__,__,__,
	__,__, 5,__,__,__,29,__,__,__,  3,__,__,__, 2,__, 1, 0, 4,__,
	__,__,28,__,__,__,26,24,25,15, 27,__,__, 7,

	55,48,49,__,50,42,40,41,51,43, 63,__,__,__,__,__,52,44,__,__,
	__,__,__,__,__,__,__,__,__,__, __,__,53,45,__,__,__,__,__,__,
	__,__,__,__,__,__,__,__,__,__, __,__,__,__,__,__,__,__,__,__,
	__,__,__,__,54,46,__,__,__,__, 38,__,__,__,62,__,__,__,__,__,
	__,__,__,__,__,__,__,__,__,__, __,__,__,__,__,__,__,__,__,__,
	__,__,37,__,__,__,61,__,__,__, 35,__,__,__,34,__,33,32,36,__,
	__,__,60,__,__,__,58,56,57,47, 59,__,__,39
#undef __
};
#endif /* !defined(__FAST_BSF) && !defined(__NO_TABLES) */

INLINE int FirstBit(const BitBoard bb)
{
	int nBit;

	__asm
	{
#if defined(__FAST_BSF) || defined(__NO_TABLES)
		mov	ecx, DWORD PTR [bb]
		mov	edx, DWORD PTR [bb+4]
		xor	eax, eax

		cmp	ecx, 1

		cmovc	ecx, edx
		adc	eax, eax
		mov	edx, 1

		bsf	ecx, ecx

		shl	edx, cl
		shl	eax, 5

		or	eax, ecx

		mov	[nBit], eax
#else
		mov	edx, DWORD PTR [bb]
		mov	ecx, DWORD PTR [bb+4]
		xor	eax, eax

		cmp	edx, 1

		cmovc	edx, ecx
		adc	eax, eax

		mov	ecx, edx
		neg	edx
		shl	eax, 5

		and	edx, ecx
		mov	[nBit], eax

		dec	edx

		xor	edx, 0x05407DB7

		mov	ecx, edx

		shr	ecx, 16

		add	edx, ecx

		sub	dl, dh

		movzx	ecx, dl
		xor	edx, edx

		mov	cl, BYTE PTR [LSB_32_table+ecx]

		mov	[nBit], cl
#endif /* defined(__FAST_BSF) || defined(__NO_TABLES) */
	}

	return nBit;
}

// No table-based version for this
INLINE int LastBit(const BitBoard bb)
{
	int nBit;

	__asm
	{
		mov	edx, DWORD PTR [bb]
		mov	ecx, DWORD PTR [bb+4]

		cmp	ecx, 1

		cmovc	ecx, edx
		sbb	eax, eax

		bsr	ecx, ecx

		inc	eax

		shl	eax, 5

		or	eax, ecx

		mov	[nBit], eax
	}

	return nBit;
}

// Faster than MMX at the expense of extra registers (I think)
// COULD be faster yet!
INLINE int PopCount(BitBoard a)
{
	int nCount;

	__asm
	{
		mov	eax, DWORD PTR [a]
		mov	ecx, DWORD PTR [a+4]

		mov	edx, eax
		shr	eax, 1
		mov	ebx, ecx

		and	eax, 0x55555555
		shr	ecx, 1

		and	ecx, 0x55555555
		sub	edx, eax

		mov	eax, edx
		shr	edx, 2
		sub	ebx, ecx

		mov	ecx, ebx
		and	eax, 0x33333333
		and	edx, 0x33333333

		shr	ebx, 2
		add	eax, edx

		mov	edx, eax
		shr	eax, 4
		and	ecx, 0x33333333

		and	ebx, 0x33333333
		add	eax, edx

		and	eax, 0x0F0F0F0F
		add	ecx, ebx

		mov	ebx, ecx
		shr	ecx, 4

		add	ecx, ebx

		and	ecx, 0x0F0F0F0F

		add	eax, ecx

		imul	eax, 0x01010101

		shr	eax, 24

		mov	[nCount], eax
	}

	return nCount;
}

-Matt
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.