Author: Eugene Nalimov
Date: 10:45:16 06/13/05
Go up one level in this thread
On June 13, 2005 at 13:23:53, Gerd Isenberg wrote:
>hi, compiler experts!
>
>Inside a recursive search routine (not alfa/beta but my fruit fly ;-) with only
>this-pointer and one additional integer parameter and local, msc2005 wastes 40
>bytes (72 with other optimizations) stackspace each call. A new stack
>defragmentation trick by ms? For 8-byte alignment those paddings seems a bit to
>huge. Each call eats one cacheline.
>Can someone please explain what's going on here ;-)
Calling conventions. You should reserve (I believe) 32 bytes on stack for
function you are calling. Extra 8 bytes are because stack should be 16-bytes
align, but on function entry it is 8 bytes aligned, and we are saving even
number of registers.
>I also wonder whether it is not possible for the compiler to keep the class
>members inside registers during the recursive search - dumb compiler ;-)
We were thinking about such optimization, but had to prune it due to some more
urgent needs. In any case you have indirect call in your function, so the
optimiziation would not fire even were it implemented.
Thanks,
Eugene
>Cheers,
>Gerd
>
>
> 00000 48 53 push rbx
> 00002 55 push rbp
> 00003 56 push rsi
> 00004 57 push rdi
> 00005 48 83 ec 28 sub rsp, 40 ; 00000028H !!!!!!!!!!!!!!!!!!!
> ....
>; rsp never refered!
> ....
> 000a1 48 83 c4 28 add rsp, 40 ; 00000028H
> 000a5 5f pop rdi
> 000a6 5e pop rsi
> 000a7 5d pop rbp
> 000a8 5b pop rbx
> 000a9 c3 ret 0
>
>source and some generated assembly:
>
>---------------------------------------------------------------
>
>#include <stdio.h>
>
>typedef unsigned __int64 BitBoard;
>
>BitBoard pow2[64] =
>{
> 0x0000000000000001,0x0000000000000002,0x0000000000000004,0x0000000000000008,
> 0x0000000000000010,0x0000000000000020,0x0000000000000040,0x0000000000000080,
> 0x0000000000000100,0x0000000000000200,0x0000000000000400,0x0000000000000800,
> 0x0000000000001000,0x0000000000002000,0x0000000000004000,0x0000000000008000,
> 0x0000000000010000,0x0000000000020000,0x0000000000040000,0x0000000000080000,
> 0x0000000000100000,0x0000000000200000,0x0000000000400000,0x0000000000800000,
> 0x0000000001000000,0x0000000002000000,0x0000000004000000,0x0000000008000000,
> 0x0000000010000000,0x0000000020000000,0x0000000040000000,0x0000000080000000,
> 0x0000000100000000,0x0000000200000000,0x0000000400000000,0x0000000800000000,
> 0x0000001000000000,0x0000002000000000,0x0000004000000000,0x0000008000000000,
> 0x0000010000000000,0x0000020000000000,0x0000040000000000,0x0000080000000000,
> 0x0000100000000000,0x0000200000000000,0x0000400000000000,0x0000800000000000,
> 0x0001000000000000,0x0002000000000000,0x0004000000000000,0x0008000000000000,
> 0x0010000000000000,0x0020000000000000,0x0040000000000000,0x0080000000000000,
> 0x0100000000000000,0x0200000000000000,0x0400000000000000,0x0800000000000000,
> 0x1000000000000000,0x2000000000000000,0x4000000000000000,0x8000000000000000
>};
>
>/*
> In 32-bit mode a
> {
> DeBruijnGenerator db;
> db.genDeBruijn(6);
> }
> takes about 71 seconds on my 2.2GHz box.
>*/
>
>class DeBruijnGenerator
>{
>protected:
> BitBoard lck, seq;
> unsigned int count, mask, depth;
>
> void searchDeBruijn(unsigned int i) {
> // i = (int)(seq >> depth) & mask;
> if ( depth > 1 ) {
> unsigned int j = (i+i)&mask; // next even index
> lck ^= pow2[i]; // lock current index
> depth--; // sequence index right
> if ( (lck & pow2[j]) == 0 ) { // next even index unlocked?
> seq &= ~pow2[depth]; // "append" zero
> searchDeBruijn(j); // do recursive search with next even index
> }
> if ( (lck & pow2[j+1]) == 0 ) { // next odd index unlocked?
> seq |= pow2[depth]; // "append" one
> searchDeBruijn(j+1); // do recursive search with next odd index
> }
> depth++; // sequence index left
> lck ^= pow2[i]; // unlock current index
> } else if ( (lck & pow2[(i+i+1)&mask]) == 0 ) {
> deBruijnFound(seq); // bit zero was already set during initialization
> count++;
> }
> }
> virtual void deBruijnFound(BitBoard deBruijn) const {};
>public:
> DeBruijnGenerator() {count = 0;}
> unsigned int getNoofFound() {return count;}
>
> void genDeBruijn(unsigned int expOfTwo){
> if ( expOfTwo > 6 ) {
> printf ("%d exceeds max exponent 6 for 64/6 sequence\n", expOfTwo);
> expOfTwo = 6;
> }
> unsigned int powOfTwo = 1 << expOfTwo;
> seq = 1; // the initial odd sequence
> lck = pow2[powOfTwo/2]; // lock last index for performance
> depth = powOfTwo-expOfTwo; // initial depth for expOfTwo leading zeros
> mask = powOfTwo-1;
> searchDeBruijn(0); // start with index 0 due to expOfTwo leading zeros
> }
>};
>
>---------------------------------------------------------------
>the 64-bit assembly of the recursive searchDeBruin routine:
>---------------------------------------------------------------
>
>; Listing generated by Microsoft (R) Optimizing Compiler Version 14.00.40607.16
>
>; Function compile flags: /Ogspy
>
>?searchDeBruijn@DeBruijnGenerator@@IEAIXI@Z PROC ;
>DeBruijnGenerator::searchDeBruijn, COMDAT
>; Line 643
>$LN8:
> 00000 48 53 push rbx
> 00002 55 push rbp
> 00003 56 push rsi
> 00004 57 push rdi
> 00005 48 83 ec 28 sub rsp, 40 ; 00000028H !!!!!!!!!!!!!!!!!!!
> 00009 48 8b d9 mov rbx, rcx
>; Line 644
> 0000c 8b 49 20 mov ecx, DWORD PTR [rcx+32]
>; Line 646
> 0000f 48 8d 3d 00 00
> 00 00 lea rdi, OFFSET FLAT:?pow2@@3PA_KA ; pow2
> 00016 83 f9 01 cmp ecx, 1
> 00019 76 63 jbe SHORT $LN5@searchDeBr
> 0001b 8b 73 1c mov esi, DWORD PTR [rbx+28]
> 0001e 8d 04 12 lea eax, DWORD PTR [rdx+rdx]
> 00021 8b ea mov ebp, edx
> 00023 23 f0 and esi, eax
> 00025 48 8b 04 ef mov rax, QWORD PTR [rdi+rbp*8]
> 00029 48 31 43 08 xor QWORD PTR [rbx+8], rax
>; Line 647
> 0002d ff c9 dec ecx
> 0002f 89 4b 20 mov DWORD PTR [rbx+32], ecx
>; Line 648
> 00032 48 8b 04 f7 mov rax, QWORD PTR [rdi+rsi*8]
> 00036 48 85 43 08 test rax, QWORD PTR [rbx+8]
> 0003a 75 15 jne SHORT $LN4@searchDeBr
>; Line 649
> 0003c 48 8b 0c cf mov rcx, QWORD PTR [rdi+rcx*8]
>; Line 650
> 00040 8b d6 mov edx, esi
> 00042 48 f7 d1 not rcx
> 00045 48 21 4b 10 and QWORD PTR [rbx+16], rcx
> 00049 48 8b cb mov rcx, rbx
> 0004c e8 00 00 00 00 call ?searchDeBruijn@DeBruijnGenerator@@IEAIXI@Z ;
>DeBruijnGenerator::searchDeBruijn
>$LN4@searchDeBr:
>; Line 652
> 00051 8d 56 01 lea edx, DWORD PTR [rsi+1]
> 00054 48 8b 04 d7 mov rax, QWORD PTR [rdi+rdx*8]
> 00058 48 85 43 08 test rax, QWORD PTR [rbx+8]
> 0005c 75 13 jne SHORT $LN3@searchDeBr
>; Line 653
> 0005e 8b 43 20 mov eax, DWORD PTR [rbx+32]
>; Line 654
> 00061 48 8b cb mov rcx, rbx
> 00064 48 8b 04 c7 mov rax, QWORD PTR [rdi+rax*8]
> 00068 48 09 43 10 or QWORD PTR [rbx+16], rax
> 0006c e8 00 00 00 00 call ?searchDeBruijn@DeBruijnGenerator@@IEAIXI@Z ;
>DeBruijnGenerator::searchDeBruijn
>$LN3@searchDeBr:
>; Line 656
> 00071 ff 43 20 inc DWORD PTR [rbx+32]
>; Line 657
> 00074 48 8b 04 ef mov rax, QWORD PTR [rdi+rbp*8]
> 00078 48 31 43 08 xor QWORD PTR [rbx+8], rax
> 0007c eb 23 jmp SHORT $LN1@searchDeBr
>$LN5@searchDeBr:
>; Line 658
> 0007e 8b 43 1c mov eax, DWORD PTR [rbx+28]
> 00081 8d 4c 12 01 lea ecx, DWORD PTR [rdx+rdx+1]
> 00085 48 23 c8 and rcx, rax
> 00088 48 8b 04 cf mov rax, QWORD PTR [rdi+rcx*8]
> 0008c 48 85 43 08 test rax, QWORD PTR [rbx+8]
> 00090 75 0f jne SHORT $LN1@searchDeBr
>; Line 659
> 00092 48 8b 03 mov rax, QWORD PTR [rbx]
> 00095 48 8b 53 10 mov rdx, QWORD PTR [rbx+16]
> 00099 48 8b cb mov rcx, rbx
> 0009c ff 10 call QWORD PTR [rax]
>; Line 660
> 0009e ff 43 18 inc DWORD PTR [rbx+24]
>$LN1@searchDeBr:
>; Line 662
> 000a1 48 83 c4 28 add rsp, 40 ; 00000028H
> 000a5 5f pop rdi
> 000a6 5e pop rsi
> 000a7 5d pop rbp
> 000a8 5b pop rbx
> 000a9 c3 ret 0
>?searchDeBruijn@DeBruijnGenerator@@IEAIXI@Z ENDP ;
>DeBruijnGenerator::searchDeBruijn
>_TEXT ENDS
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.