Author: Gerd Isenberg
Date: 11:12:35 01/19/03
Go up one level in this thread
On January 19, 2003 at 07:39:42, David Rasmussen wrote:
>Here
>
>http://www.talkchess.com/forums/1/message.html?278161
>
>you can read about the code generated by MSVC (6 or 7) and Intel C++ for simple
>64-bit constructs. I am wondering how much could be gained on x86 by defining
>this and other fundamental functions in assembly.
>
>Below are some inlined functions from my bitboard.h file. For people who know
>bitboards, most of it should be obvious. My orientation is a1=0 .. h8=63. My
>question to assembly programmers is: Can you make (hopefully faster) x86
>assembly equivalents of these functions, since it seems the compilers can't even
>generate good code for these simple things? I know x86/IA32 means many different
>things, but I am talking about newer processors. I don't care if it doesn't work
>on the original Pentium or K6. On the other hand, I would like to have "clean"
>x86 versions instead of versions that works only on P4 for example (I don't have
>a P4). Stuff that works on all newer processors such as MMX is ok, if it help
>etc. I have an Athlon XP 1800+ myself.
>Will you help me (and others)?
>
>(well these are some constants, but maybe they're intereting to some)
>const BitBoard lightSquares = 0x55aa55aa | (BitBoard(0x55aa55aa) << 32);
>const BitBoard darkSquares = ~lightSquares;
>const BitBoard center = 0x18000000 | BitBoard(0x00000018) << 32;
>
>//INLINE BitBoard Mask(Square square) { return BitBoard(1) << square; }
>INLINE BitBoard Mask(Square square) { return mask[square]; }
>INLINE BitBoard RankMask(Rank rank) { return rankMask[rank]; }
>INLINE BitBoard FileMask(File file) { return fileMask[file]; }
>INLINE void Set(BitBoard& bitboard, Square square)
>{ bitboard |= Mask(square); }
>INLINE void Clear(BitBoard& bitboard, Square square)
>{ bitboard &= ~Mask(square); }
>INLINE void Toggle(BitBoard& bitboard, BitBoard mask)
>{ bitboard ^= mask; }
>
>INLINE Square Rotate45Left(Square square) { return rotate45Left[square]; }
>INLINE Square Rotate45Right(Square square) { return rotate45Right[square]; }
>INLINE Square Rotate90Left(Square square) { return rotate90Left[square]; }
>INLINE Square UnRotate45Left(Square square) { return unrotate45Left[square]; }
>INLINE Square UnRotate45Right(Square square) { return unrotate45Right[square]; }
>INLINE Square UnRotate90Left(Square square) { return unrotate90Left[square]; }
>
>INLINE void SetOccupied(Position& pos, Square square, Color color)
>{
> Set(pos.occupied[color],square);
> Set(pos.occupiedRotate90Left,Rotate90Left(square));
> Set(pos.occupiedRotate45Left,Rotate45Left(square));
> Set(pos.occupiedRotate45Right,Rotate45Right(square));
>}
>
>INLINE void ClearOccupied(Position &pos, Square square, Color color)
>{
> Clear(pos.occupied[color],square);
> Clear(pos.occupiedRotate90Left,Rotate90Left(square));
> Clear(pos.occupiedRotate45Left,Rotate45Left(square));
> Clear(pos.occupiedRotate45Right,Rotate45Right(square));
>}
>
>INLINE int RankShift(Square square) { return rankShift[square]; }
>INLINE int FileShift(Square square) { return fileShift[square]; }
>
>INLINE int DiagonalShiftA1H8(Square square)
>{ return diagonalShiftA1H8[square]; }
>INLINE int DiagonalShiftH1A8(Square square)
>{ return diagonalShiftH1A8[square]; }
>
>INLINE BitBoard Occupied(const Position &pos)
>{ return pos.occupied[WHITE] | pos.occupied[BLACK]; }
>
>INLINE BitBoard FileAttack(const Position& pos, Square square)
>{
> return fileAttack
> [square]
> [int((pos.occupiedRotate90Left >> FileShift(square)) & 0xFF)];
>}
>
>INLINE BitBoard RankAttack(const Position& pos, Square square)
>{
> return rankAttack
> [square]
> [int((Occupied(pos) >> RankShift(square)) & 0xFF)];
>}
>
>INLINE BitBoard DiagonalAttackA1H8(const Position& pos, Square square)
>{
> return diagonalAttackA1H8
> [square]
> [int((pos.occupiedRotate45Right
> >> DiagonalShiftA1H8(square)) & 0xFF)];
>}
>
>INLINE BitBoard DiagonalAttackH1A8(const Position& pos, Square square)
>{
> return diagonalAttackH1A8
> [square]
> [int((pos.occupiedRotate45Left
> >> DiagonalShiftH1A8(square)) & 0xFF)];
>}
>
>INLINE BitBoard PawnAttack(Square square, Color color)
>{ return pawnAttack[color][square]; }
>
>INLINE BitBoard KnightAttack(Square square)
>{ return knightAttack[square]; }
>
>INLINE BitBoard BishopAttack(const Position &pos, Square square)
>{ return DiagonalAttackA1H8(pos,square) | DiagonalAttackH1A8(pos,square); }
>
>INLINE BitBoard RookAttack(const Position& pos, Square square)
>{ return RankAttack(pos,square) | FileAttack(pos,square); }
>
>INLINE BitBoard QueenAttack(const Position& pos, Square square)
>{ return RookAttack(pos,square) | BishopAttack(pos,square); }
>
>INLINE BitBoard KingAttack(Square square)
>{ return kingAttack[square]; }
>
Hi David,
that's all fine. I used rotated with 64*64 instead of 64*256 array, because the
outer occupied states don't matter, the inner six bit are enough.
>I don't remember where I got these, I probably stole them or copied them from
>discussions on CCC. Maybe they can be even faster:
>
>INLINE int FirstBit(const BitBoard bitboard)
>{
> __asm
> {
> mov ecx,dword ptr [bitboard+4]
> mov ebx,dword ptr [bitboard]
> bsf eax,ecx
> add eax,32
> bsf eax,ebx
> }
>}
>
>INLINE int LastBit(const BitBoard bitboard)
>{
> __asm
> {
> mov ecx,dword ptr [bitboard]
> mov ebx,dword ptr [bitboard+4]
> bsr eax,ecx
> sub eax,32
> bsr eax,ebx
> add eax,32
> }
>}
Why the register loads, you can do bsf/bsf directly with memory source operands:
bsf eax,[bitboard+4]
xor eax, 32
bsf eax,[bitboard]
>
>INLINE int PopCount(BitBoard a) // MMX
>{
> static const __int64 C55 = 0x5555555555555555;
> static const __int64 C33 = 0x3333333333333333;
> static const __int64 C0F = 0x0F0F0F0F0F0F0F0F;
>
> __asm {
> movd mm0, word ptr a;
> punpckldq mm0, word ptr a + 4;
> movq mm1, mm0;
> psrld mm0, 1;
> pand mm0, [C55];
> psubd mm1, mm0;
> movq mm0, mm1;
> psrld mm1, 2;
> pand mm0, [C33];
> pand mm1, [C33];
> paddd mm0, mm1;
> movq mm1, mm0;
> psrld mm0, 4;
> paddd mm0, mm1;
> pand mm0, [C0F];
> pxor mm1, mm1;
> psadbw mm0, mm1;
> movd eax, mm0;
> emms; femms for athlon is faster
// or skip emms at all, if you don't use float
> }
>}
I found this modified one slightly faster (saves a few bytes):
Regards,
Gerd
---------------------------------------------------------------
struct SBitCountConsts
{
BitBoard C55;
BitBoard C33;
BitBoard C0F;
...
};
extern const SBitCountConsts BitCountConsts;
__forceinline
int PopCount (BitBoard bb)
{
__asm
{
movd mm0, word ptr bb
punpckldq mm0, word ptr bb + 4
lea eax, [BitCountConsts]
movq mm1, mm0
psrld mm0, 1
pand mm0, [eax].C55
psubd mm1, mm0
movq mm0, mm1
psrld mm1, 2
pand mm0, [eax].C33
pand mm1, [eax].C33
paddd mm0, mm1
movq mm1, mm0
psrld mm0, 4
paddd mm0, mm1
pand mm0, [eax].C0F
pxor mm1, mm1
psadbw mm0, mm1
movd eax, mm0
}
}
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.