Author: Gerd Isenberg
Date: 10:23:53 06/13/05
hi, compiler experts!
Inside a recursive search routine (not alfa/beta but my fruit fly ;-) with only
this-pointer and one additional integer parameter and local, msc2005 wastes 40
bytes (72 with other optimizations) stackspace each call. A new stack
defragmentation trick by ms? For 8-byte alignment those paddings seems a bit to
huge. Each call eats one cacheline.
Can someone please explain what's going on here ;-)
I also wonder whether it is not possible for the compiler to keep the class
members inside registers during the recursive search - dumb compiler ;-)
Cheers,
Gerd
00000 48 53 push rbx
00002 55 push rbp
00003 56 push rsi
00004 57 push rdi
00005 48 83 ec 28 sub rsp, 40 ; 00000028H !!!!!!!!!!!!!!!!!!!
....
; rsp never refered!
....
000a1 48 83 c4 28 add rsp, 40 ; 00000028H
000a5 5f pop rdi
000a6 5e pop rsi
000a7 5d pop rbp
000a8 5b pop rbx
000a9 c3 ret 0
source and some generated assembly:
---------------------------------------------------------------
#include <stdio.h>
typedef unsigned __int64 BitBoard;
BitBoard pow2[64] =
{
0x0000000000000001,0x0000000000000002,0x0000000000000004,0x0000000000000008,
0x0000000000000010,0x0000000000000020,0x0000000000000040,0x0000000000000080,
0x0000000000000100,0x0000000000000200,0x0000000000000400,0x0000000000000800,
0x0000000000001000,0x0000000000002000,0x0000000000004000,0x0000000000008000,
0x0000000000010000,0x0000000000020000,0x0000000000040000,0x0000000000080000,
0x0000000000100000,0x0000000000200000,0x0000000000400000,0x0000000000800000,
0x0000000001000000,0x0000000002000000,0x0000000004000000,0x0000000008000000,
0x0000000010000000,0x0000000020000000,0x0000000040000000,0x0000000080000000,
0x0000000100000000,0x0000000200000000,0x0000000400000000,0x0000000800000000,
0x0000001000000000,0x0000002000000000,0x0000004000000000,0x0000008000000000,
0x0000010000000000,0x0000020000000000,0x0000040000000000,0x0000080000000000,
0x0000100000000000,0x0000200000000000,0x0000400000000000,0x0000800000000000,
0x0001000000000000,0x0002000000000000,0x0004000000000000,0x0008000000000000,
0x0010000000000000,0x0020000000000000,0x0040000000000000,0x0080000000000000,
0x0100000000000000,0x0200000000000000,0x0400000000000000,0x0800000000000000,
0x1000000000000000,0x2000000000000000,0x4000000000000000,0x8000000000000000
};
/*
In 32-bit mode a
{
DeBruijnGenerator db;
db.genDeBruijn(6);
}
takes about 71 seconds on my 2.2GHz box.
*/
class DeBruijnGenerator
{
protected:
BitBoard lck, seq;
unsigned int count, mask, depth;
void searchDeBruijn(unsigned int i) {
// i = (int)(seq >> depth) & mask;
if ( depth > 1 ) {
unsigned int j = (i+i)&mask; // next even index
lck ^= pow2[i]; // lock current index
depth--; // sequence index right
if ( (lck & pow2[j]) == 0 ) { // next even index unlocked?
seq &= ~pow2[depth]; // "append" zero
searchDeBruijn(j); // do recursive search with next even index
}
if ( (lck & pow2[j+1]) == 0 ) { // next odd index unlocked?
seq |= pow2[depth]; // "append" one
searchDeBruijn(j+1); // do recursive search with next odd index
}
depth++; // sequence index left
lck ^= pow2[i]; // unlock current index
} else if ( (lck & pow2[(i+i+1)&mask]) == 0 ) {
deBruijnFound(seq); // bit zero was already set during initialization
count++;
}
}
virtual void deBruijnFound(BitBoard deBruijn) const {};
public:
DeBruijnGenerator() {count = 0;}
unsigned int getNoofFound() {return count;}
void genDeBruijn(unsigned int expOfTwo){
if ( expOfTwo > 6 ) {
printf ("%d exceeds max exponent 6 for 64/6 sequence\n", expOfTwo);
expOfTwo = 6;
}
unsigned int powOfTwo = 1 << expOfTwo;
seq = 1; // the initial odd sequence
lck = pow2[powOfTwo/2]; // lock last index for performance
depth = powOfTwo-expOfTwo; // initial depth for expOfTwo leading zeros
mask = powOfTwo-1;
searchDeBruijn(0); // start with index 0 due to expOfTwo leading zeros
}
};
---------------------------------------------------------------
the 64-bit assembly of the recursive searchDeBruin routine:
---------------------------------------------------------------
; Listing generated by Microsoft (R) Optimizing Compiler Version 14.00.40607.16
; Function compile flags: /Ogspy
?searchDeBruijn@DeBruijnGenerator@@IEAIXI@Z PROC ;
DeBruijnGenerator::searchDeBruijn, COMDAT
; Line 643
$LN8:
00000 48 53 push rbx
00002 55 push rbp
00003 56 push rsi
00004 57 push rdi
00005 48 83 ec 28 sub rsp, 40 ; 00000028H !!!!!!!!!!!!!!!!!!!
00009 48 8b d9 mov rbx, rcx
; Line 644
0000c 8b 49 20 mov ecx, DWORD PTR [rcx+32]
; Line 646
0000f 48 8d 3d 00 00
00 00 lea rdi, OFFSET FLAT:?pow2@@3PA_KA ; pow2
00016 83 f9 01 cmp ecx, 1
00019 76 63 jbe SHORT $LN5@searchDeBr
0001b 8b 73 1c mov esi, DWORD PTR [rbx+28]
0001e 8d 04 12 lea eax, DWORD PTR [rdx+rdx]
00021 8b ea mov ebp, edx
00023 23 f0 and esi, eax
00025 48 8b 04 ef mov rax, QWORD PTR [rdi+rbp*8]
00029 48 31 43 08 xor QWORD PTR [rbx+8], rax
; Line 647
0002d ff c9 dec ecx
0002f 89 4b 20 mov DWORD PTR [rbx+32], ecx
; Line 648
00032 48 8b 04 f7 mov rax, QWORD PTR [rdi+rsi*8]
00036 48 85 43 08 test rax, QWORD PTR [rbx+8]
0003a 75 15 jne SHORT $LN4@searchDeBr
; Line 649
0003c 48 8b 0c cf mov rcx, QWORD PTR [rdi+rcx*8]
; Line 650
00040 8b d6 mov edx, esi
00042 48 f7 d1 not rcx
00045 48 21 4b 10 and QWORD PTR [rbx+16], rcx
00049 48 8b cb mov rcx, rbx
0004c e8 00 00 00 00 call ?searchDeBruijn@DeBruijnGenerator@@IEAIXI@Z ;
DeBruijnGenerator::searchDeBruijn
$LN4@searchDeBr:
; Line 652
00051 8d 56 01 lea edx, DWORD PTR [rsi+1]
00054 48 8b 04 d7 mov rax, QWORD PTR [rdi+rdx*8]
00058 48 85 43 08 test rax, QWORD PTR [rbx+8]
0005c 75 13 jne SHORT $LN3@searchDeBr
; Line 653
0005e 8b 43 20 mov eax, DWORD PTR [rbx+32]
; Line 654
00061 48 8b cb mov rcx, rbx
00064 48 8b 04 c7 mov rax, QWORD PTR [rdi+rax*8]
00068 48 09 43 10 or QWORD PTR [rbx+16], rax
0006c e8 00 00 00 00 call ?searchDeBruijn@DeBruijnGenerator@@IEAIXI@Z ;
DeBruijnGenerator::searchDeBruijn
$LN3@searchDeBr:
; Line 656
00071 ff 43 20 inc DWORD PTR [rbx+32]
; Line 657
00074 48 8b 04 ef mov rax, QWORD PTR [rdi+rbp*8]
00078 48 31 43 08 xor QWORD PTR [rbx+8], rax
0007c eb 23 jmp SHORT $LN1@searchDeBr
$LN5@searchDeBr:
; Line 658
0007e 8b 43 1c mov eax, DWORD PTR [rbx+28]
00081 8d 4c 12 01 lea ecx, DWORD PTR [rdx+rdx+1]
00085 48 23 c8 and rcx, rax
00088 48 8b 04 cf mov rax, QWORD PTR [rdi+rcx*8]
0008c 48 85 43 08 test rax, QWORD PTR [rbx+8]
00090 75 0f jne SHORT $LN1@searchDeBr
; Line 659
00092 48 8b 03 mov rax, QWORD PTR [rbx]
00095 48 8b 53 10 mov rdx, QWORD PTR [rbx+16]
00099 48 8b cb mov rcx, rbx
0009c ff 10 call QWORD PTR [rax]
; Line 660
0009e ff 43 18 inc DWORD PTR [rbx+24]
$LN1@searchDeBr:
; Line 662
000a1 48 83 c4 28 add rsp, 40 ; 00000028H
000a5 5f pop rdi
000a6 5e pop rsi
000a7 5d pop rbp
000a8 5b pop rbx
000a9 c3 ret 0
?searchDeBruijn@DeBruijnGenerator@@IEAIXI@Z ENDP ;
DeBruijnGenerator::searchDeBruijn
_TEXT ENDS
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.