Computer Chess Club Archives


Search

Terms

Messages

Subject: odd msc 2005 behaviour - 64-bit mode

Author: Gerd Isenberg

Date: 10:23:53 06/13/05


hi, compiler experts!

Inside a recursive search routine (not alfa/beta but my fruit fly ;-) with only
this-pointer and one additional integer parameter and local, msc2005 wastes 40
bytes (72 with other optimizations) stackspace each call. A new stack
defragmentation trick by ms? For 8-byte alignment those paddings seems a bit to
huge. Each call eats one cacheline.
Can someone please explain what's going on here ;-)

I also wonder whether it is not possible for the compiler to keep the class
members inside registers during the recursive search - dumb compiler ;-)

Cheers,
Gerd


  00000	48 53		 push	 rbx
  00002	55		 push	 rbp
  00003	56		 push	 rsi
  00004	57		 push	 rdi
  00005	48 83 ec 28	 sub	 rsp, 40     ; 00000028H !!!!!!!!!!!!!!!!!!!
  ....
; rsp never refered!
  ....
  000a1	48 83 c4 28	 add	 rsp, 40			; 00000028H
  000a5	5f		 pop	 rdi
  000a6	5e		 pop	 rsi
  000a7	5d		 pop	 rbp
  000a8	5b		 pop	 rbx
  000a9	c3		 ret	 0

source and some generated assembly:

---------------------------------------------------------------

#include <stdio.h>

typedef unsigned __int64 BitBoard;

BitBoard pow2[64] =
{
 0x0000000000000001,0x0000000000000002,0x0000000000000004,0x0000000000000008,
 0x0000000000000010,0x0000000000000020,0x0000000000000040,0x0000000000000080,
 0x0000000000000100,0x0000000000000200,0x0000000000000400,0x0000000000000800,
 0x0000000000001000,0x0000000000002000,0x0000000000004000,0x0000000000008000,
 0x0000000000010000,0x0000000000020000,0x0000000000040000,0x0000000000080000,
 0x0000000000100000,0x0000000000200000,0x0000000000400000,0x0000000000800000,
 0x0000000001000000,0x0000000002000000,0x0000000004000000,0x0000000008000000,
 0x0000000010000000,0x0000000020000000,0x0000000040000000,0x0000000080000000,
 0x0000000100000000,0x0000000200000000,0x0000000400000000,0x0000000800000000,
 0x0000001000000000,0x0000002000000000,0x0000004000000000,0x0000008000000000,
 0x0000010000000000,0x0000020000000000,0x0000040000000000,0x0000080000000000,
 0x0000100000000000,0x0000200000000000,0x0000400000000000,0x0000800000000000,
 0x0001000000000000,0x0002000000000000,0x0004000000000000,0x0008000000000000,
 0x0010000000000000,0x0020000000000000,0x0040000000000000,0x0080000000000000,
 0x0100000000000000,0x0200000000000000,0x0400000000000000,0x0800000000000000,
 0x1000000000000000,0x2000000000000000,0x4000000000000000,0x8000000000000000
};

/*
  In 32-bit mode a
  {
    DeBruijnGenerator db;
    db.genDeBruijn(6);
  }
  takes about 71 seconds on my 2.2GHz box.
*/

class DeBruijnGenerator
{
protected:
  BitBoard lck, seq;
  unsigned int count, mask, depth;

  void  searchDeBruijn(unsigned int i) {
    //	i = (int)(seq >> depth) & mask;
    if ( depth > 1 ) {
      unsigned int j = (i+i)&mask; // next even index
      lck ^= pow2[i];	// lock current index
      depth--;		// sequence index right
      if ( (lck & pow2[j]) == 0 ) { // next even index unlocked?
        seq &= ~pow2[depth]; // "append" zero
        searchDeBruijn(j);   // do recursive search with next even index
      }
      if ( (lck & pow2[j+1]) == 0 ) { // next odd index unlocked?
        seq |=  pow2[depth];    // "append" one
        searchDeBruijn(j+1);	// do recursive search with next odd index
      }
      depth++;		// sequence index left
      lck ^= pow2[i];	// unlock current index
    } else if ( (lck & pow2[(i+i+1)&mask]) == 0 ) {
      deBruijnFound(seq); // bit zero was already set during initialization
      count++;
    }
  }
  virtual void deBruijnFound(BitBoard deBruijn) const {};
public:
  DeBruijnGenerator() {count = 0;}
  unsigned int getNoofFound() {return count;}

  void genDeBruijn(unsigned int expOfTwo){
    if ( expOfTwo > 6 ) {
      printf ("%d exceeds max exponent 6 for 64/6 sequence\n", expOfTwo);
      expOfTwo = 6;
    }
    unsigned int powOfTwo = 1 << expOfTwo;
    seq   = 1;		       // the initial odd sequence
    lck   = pow2[powOfTwo/2];  // lock last index for performance
    depth = powOfTwo-expOfTwo; // initial depth for expOfTwo leading zeros
    mask  = powOfTwo-1;
    searchDeBruijn(0);  // start with index 0 due to expOfTwo leading zeros
  }
};

---------------------------------------------------------------
the 64-bit assembly of the recursive searchDeBruin routine:
---------------------------------------------------------------

; Listing generated by Microsoft (R) Optimizing Compiler Version 14.00.40607.16

; Function compile flags: /Ogspy

?searchDeBruijn@DeBruijnGenerator@@IEAIXI@Z PROC	;
DeBruijnGenerator::searchDeBruijn, COMDAT
; Line 643
$LN8:
  00000	48 53		 push	 rbx
  00002	55		 push	 rbp
  00003	56		 push	 rsi
  00004	57		 push	 rdi
  00005	48 83 ec 28	 sub	 rsp, 40     ; 00000028H !!!!!!!!!!!!!!!!!!!
  00009	48 8b d9	 mov	 rbx, rcx
; Line 644
  0000c	8b 49 20	 mov	 ecx, DWORD PTR [rcx+32]
; Line 646
  0000f	48 8d 3d 00 00
	00 00		 lea	 rdi, OFFSET FLAT:?pow2@@3PA_KA ; pow2
  00016	83 f9 01	 cmp	 ecx, 1
  00019	76 63		 jbe	 SHORT $LN5@searchDeBr
  0001b	8b 73 1c	 mov	 esi, DWORD PTR [rbx+28]
  0001e	8d 04 12	 lea	 eax, DWORD PTR [rdx+rdx]
  00021	8b ea		 mov	 ebp, edx
  00023	23 f0		 and	 esi, eax
  00025	48 8b 04 ef	 mov	 rax, QWORD PTR [rdi+rbp*8]
  00029	48 31 43 08	 xor	 QWORD PTR [rbx+8], rax
; Line 647
  0002d	ff c9		 dec	 ecx
  0002f	89 4b 20	 mov	 DWORD PTR [rbx+32], ecx
; Line 648
  00032	48 8b 04 f7	 mov	 rax, QWORD PTR [rdi+rsi*8]
  00036	48 85 43 08	 test	 rax, QWORD PTR [rbx+8]
  0003a	75 15		 jne	 SHORT $LN4@searchDeBr
; Line 649
  0003c	48 8b 0c cf	 mov	 rcx, QWORD PTR [rdi+rcx*8]
; Line 650
  00040	8b d6		 mov	 edx, esi
  00042	48 f7 d1	 not	 rcx
  00045	48 21 4b 10	 and	 QWORD PTR [rbx+16], rcx
  00049	48 8b cb	 mov	 rcx, rbx
  0004c	e8 00 00 00 00	 call	 ?searchDeBruijn@DeBruijnGenerator@@IEAIXI@Z ;
DeBruijnGenerator::searchDeBruijn
$LN4@searchDeBr:
; Line 652
  00051	8d 56 01	 lea	 edx, DWORD PTR [rsi+1]
  00054	48 8b 04 d7	 mov	 rax, QWORD PTR [rdi+rdx*8]
  00058	48 85 43 08	 test	 rax, QWORD PTR [rbx+8]
  0005c	75 13		 jne	 SHORT $LN3@searchDeBr
; Line 653
  0005e	8b 43 20	 mov	 eax, DWORD PTR [rbx+32]
; Line 654
  00061	48 8b cb	 mov	 rcx, rbx
  00064	48 8b 04 c7	 mov	 rax, QWORD PTR [rdi+rax*8]
  00068	48 09 43 10	 or	 QWORD PTR [rbx+16], rax
  0006c	e8 00 00 00 00	 call	 ?searchDeBruijn@DeBruijnGenerator@@IEAIXI@Z ;
DeBruijnGenerator::searchDeBruijn
$LN3@searchDeBr:
; Line 656
  00071	ff 43 20	 inc	 DWORD PTR [rbx+32]
; Line 657
  00074	48 8b 04 ef	 mov	 rax, QWORD PTR [rdi+rbp*8]
  00078	48 31 43 08	 xor	 QWORD PTR [rbx+8], rax
  0007c	eb 23		 jmp	 SHORT $LN1@searchDeBr
$LN5@searchDeBr:
; Line 658
  0007e	8b 43 1c	 mov	 eax, DWORD PTR [rbx+28]
  00081	8d 4c 12 01	 lea	 ecx, DWORD PTR [rdx+rdx+1]
  00085	48 23 c8	 and	 rcx, rax
  00088	48 8b 04 cf	 mov	 rax, QWORD PTR [rdi+rcx*8]
  0008c	48 85 43 08	 test	 rax, QWORD PTR [rbx+8]
  00090	75 0f		 jne	 SHORT $LN1@searchDeBr
; Line 659
  00092	48 8b 03	 mov	 rax, QWORD PTR [rbx]
  00095	48 8b 53 10	 mov	 rdx, QWORD PTR [rbx+16]
  00099	48 8b cb	 mov	 rcx, rbx
  0009c	ff 10		 call	 QWORD PTR [rax]
; Line 660
  0009e	ff 43 18	 inc	 DWORD PTR [rbx+24]
$LN1@searchDeBr:
; Line 662
  000a1	48 83 c4 28	 add	 rsp, 40	; 00000028H
  000a5	5f		 pop	 rdi
  000a6	5e		 pop	 rsi
  000a7	5d		 pop	 rbp
  000a8	5b		 pop	 rbx
  000a9	c3		 ret	 0
?searchDeBruijn@DeBruijnGenerator@@IEAIXI@Z ENDP	;
DeBruijnGenerator::searchDeBruijn
_TEXT	ENDS




This page took 0 seconds to execute

Last modified: Thu, 15 Apr 21 08:11:13 -0700

Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.