Computer Chess Club Archives


Search

Terms

Messages

Subject: Re: Assembly Programmers Challenge!

Author: Gerd Isenberg

Date: 11:12:35 01/19/03

Go up one level in this thread


On January 19, 2003 at 07:39:42, David Rasmussen wrote:

>Here
>
>http://www.talkchess.com/forums/1/message.html?278161
>
>you can read about the code generated by MSVC (6 or 7) and Intel C++ for simple
>64-bit constructs. I am wondering how much could be gained on x86 by defining
>this and other fundamental functions in assembly.
>
>Below are some inlined functions from my bitboard.h file. For people who know
>bitboards, most of it should be obvious. My orientation is a1=0 .. h8=63. My
>question to assembly programmers is: Can you make (hopefully faster) x86
>assembly equivalents of these functions, since it seems the compilers can't even
>generate good code for these simple things? I know x86/IA32 means many different
>things, but I am talking about newer processors. I don't care if it doesn't work
>on the original Pentium or K6. On the other hand, I would like to have "clean"
>x86 versions instead of versions that works only on P4 for example (I don't have
>a P4). Stuff that works on all newer processors such as MMX is ok, if it help
>etc. I have an Athlon XP 1800+ myself.
>Will you help me (and others)?
>
>(well these are some constants, but maybe they're intereting to some)
>const BitBoard lightSquares = 0x55aa55aa | (BitBoard(0x55aa55aa) << 32);
>const BitBoard darkSquares = ~lightSquares;
>const BitBoard center = 0x18000000 | BitBoard(0x00000018) << 32;
>
>//INLINE BitBoard Mask(Square square) { return BitBoard(1) << square; }
>INLINE BitBoard Mask(Square square) { return mask[square]; }
>INLINE BitBoard RankMask(Rank rank) { return rankMask[rank]; }
>INLINE BitBoard FileMask(File file) { return fileMask[file]; }
>INLINE void Set(BitBoard& bitboard, Square square)
>{ bitboard |= Mask(square); }
>INLINE void Clear(BitBoard& bitboard, Square square)
>{ bitboard &= ~Mask(square); }
>INLINE void Toggle(BitBoard& bitboard, BitBoard mask)
>{ bitboard ^= mask; }
>
>INLINE Square Rotate45Left(Square square) { return rotate45Left[square]; }
>INLINE Square Rotate45Right(Square square) { return rotate45Right[square]; }
>INLINE Square Rotate90Left(Square square) { return rotate90Left[square]; }
>INLINE Square UnRotate45Left(Square square) { return unrotate45Left[square]; }
>INLINE Square UnRotate45Right(Square square) { return unrotate45Right[square]; }
>INLINE Square UnRotate90Left(Square square) { return unrotate90Left[square]; }
>
>INLINE void SetOccupied(Position& pos, Square square, Color color)
>{
>	Set(pos.occupied[color],square);
>	Set(pos.occupiedRotate90Left,Rotate90Left(square));
>	Set(pos.occupiedRotate45Left,Rotate45Left(square));
>	Set(pos.occupiedRotate45Right,Rotate45Right(square));
>}
>
>INLINE void ClearOccupied(Position &pos, Square square, Color color)
>{
>	Clear(pos.occupied[color],square);
>	Clear(pos.occupiedRotate90Left,Rotate90Left(square));
>	Clear(pos.occupiedRotate45Left,Rotate45Left(square));
>	Clear(pos.occupiedRotate45Right,Rotate45Right(square));
>}
>
>INLINE int RankShift(Square square) { return rankShift[square]; }
>INLINE int FileShift(Square square) { return fileShift[square]; }
>
>INLINE int DiagonalShiftA1H8(Square square)
>{ return diagonalShiftA1H8[square]; }
>INLINE int DiagonalShiftH1A8(Square square)
>{ return diagonalShiftH1A8[square]; }
>
>INLINE BitBoard Occupied(const Position &pos)
>{ return pos.occupied[WHITE] | pos.occupied[BLACK]; }
>
>INLINE BitBoard FileAttack(const Position& pos, Square square)
>{
>	return fileAttack
>		[square]
>		[int((pos.occupiedRotate90Left >> FileShift(square)) & 0xFF)];
>}
>
>INLINE BitBoard RankAttack(const Position& pos, Square square)
>{
>	return rankAttack
>		[square]
>		[int((Occupied(pos) >> RankShift(square)) & 0xFF)];
>}
>
>INLINE BitBoard DiagonalAttackA1H8(const Position& pos, Square square)
>{
>	return diagonalAttackA1H8
>		[square]
>		[int((pos.occupiedRotate45Right
>			>> DiagonalShiftA1H8(square)) & 0xFF)];
>}
>
>INLINE BitBoard DiagonalAttackH1A8(const Position& pos, Square square)
>{
>	return diagonalAttackH1A8
>		[square]
>		[int((pos.occupiedRotate45Left
>			>> DiagonalShiftH1A8(square)) & 0xFF)];
>}
>
>INLINE BitBoard PawnAttack(Square square, Color color)
>{ return pawnAttack[color][square]; }
>
>INLINE BitBoard KnightAttack(Square square)
>{ return knightAttack[square]; }
>
>INLINE BitBoard BishopAttack(const Position &pos, Square square)
>{ return DiagonalAttackA1H8(pos,square) | DiagonalAttackH1A8(pos,square); }
>
>INLINE BitBoard RookAttack(const Position& pos, Square square)
>{ return RankAttack(pos,square) | FileAttack(pos,square); }
>
>INLINE BitBoard QueenAttack(const Position& pos, Square square)
>{ return RookAttack(pos,square) | BishopAttack(pos,square); }
>
>INLINE BitBoard KingAttack(Square square)
>{ return kingAttack[square]; }
>

Hi David,

that's all fine. I used rotated with 64*64 instead of 64*256 array, because the
outer occupied states don't matter, the inner six bit are enough.


>I don't remember where I got these, I probably stole them or copied them from
>discussions on CCC. Maybe they can be even faster:
>
>INLINE int FirstBit(const BitBoard bitboard)
>{
>	__asm
>	{
>		mov ecx,dword ptr [bitboard+4]
>		mov ebx,dword ptr [bitboard]
>		bsf eax,ecx
>		add eax,32
>		bsf eax,ebx
>	}
>}
>
>INLINE int LastBit(const BitBoard bitboard)
>{
>	__asm
>	{
>		mov ecx,dword ptr [bitboard]
>		mov ebx,dword ptr [bitboard+4]
>		bsr eax,ecx
>		sub eax,32
>		bsr eax,ebx
>		add eax,32
>	}
>}


Why the register loads, you can do bsf/bsf directly with memory source operands:

		bsf		eax,[bitboard+4]
		xor		eax, 32
		bsf		eax,[bitboard]



>
>INLINE int PopCount(BitBoard a) // MMX
>{
>    static const __int64 C55 = 0x5555555555555555;
>    static const __int64 C33 = 0x3333333333333333;
>    static const __int64 C0F = 0x0F0F0F0F0F0F0F0F;
>
>    __asm {
>        movd            mm0, word ptr a;
>        punpckldq       mm0, word ptr a + 4;
>        movq            mm1, mm0;
>        psrld           mm0, 1;
>        pand            mm0, [C55];
>        psubd           mm1, mm0;
>        movq            mm0, mm1;
>        psrld           mm1, 2;
>        pand            mm0, [C33];
>        pand            mm1, [C33];
>        paddd           mm0, mm1;
>        movq            mm1, mm0;
>        psrld           mm0, 4;
>        paddd           mm0, mm1;
>        pand            mm0, [C0F];
>        pxor            mm1, mm1;
>        psadbw mm0, mm1;
>        movd            eax, mm0;
>        emms;  femms for athlon is faster
//  or skip emms at all, if you don't use float
>    }
>}


I found this modified one slightly faster (saves a few bytes):

Regards,
Gerd

---------------------------------------------------------------

struct SBitCountConsts
{
	BitBoard C55;
	BitBoard C33;
	BitBoard C0F;
        ...
};
extern const SBitCountConsts BitCountConsts;

__forceinline
int PopCount (BitBoard bb)
{
	__asm
	{
		movd  mm0, word ptr bb
		punpckldq mm0, word ptr bb + 4
		lea   eax, [BitCountConsts]
		movq  mm1, mm0
		psrld mm0, 1
		pand  mm0, [eax].C55
		psubd mm1, mm0
		movq  mm0, mm1
		psrld mm1, 2
		pand  mm0, [eax].C33
		pand  mm1, [eax].C33
		paddd mm0, mm1
		movq  mm1, mm0
		psrld mm0, 4
		paddd mm0, mm1
		pand  mm0, [eax].C0F
		pxor  mm1, mm1
		psadbw mm0, mm1
		movd  eax, mm0
	}
}




This page took 0 seconds to execute

Last modified: Thu, 15 Apr 21 08:11:13 -0700

Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.