Computer Chess Club Archives

Search

Terms
Messages

Subject: here are the changes

Author: Mike Byrne
Date: 18:48:58 07/27/04
change the chess.h section  for NT_i386 to look like this:

#if defined(NT_i386)
#  undef  HAS_64BITS    /* machine has 64-bit integers / operators
*/
#  define HAS_LONGLONG  /* machine has 32-bit/64-bit integers
*/
#  undef  UNIX          /* system is unix-based
*/
#  undef  STDCALL
#  define STDCALL __stdcall
#  ifdef  VC_INLINE_ASM
#    undef  CDECL
#    define CDECL __cdecl
#    define USE_ASSEMBLY
#  endif
#endif

change the the *.c files (boolean and data.c) that have "INLINE_ASM"
to "USE_ASSEMBLY" and then if you have problems with compiling the
NUMA scetion in utility.c - just comment out the NUMA section -- if
you be careful and follow the brackets.

here is the section that has it modified with the NUMA sections
commented out:

==============================================================================

/*
 *******************************************************************************
 *
        *
 *   Windows NUMA support
        *
 *
        *
 *******************************************************************************
 */

#if (defined(_WIN32) || defined(_WIN64)) && defined(SMP)

lock_t ThreadsLock;

static BOOL(WINAPI * pGetNumaHighestNodeNumber) (PULONG);
static BOOL(WINAPI * pGetNumaNodeProcessorMask) (UCHAR, PULONGLONG);
static DWORD(WINAPI * pSetThreadIdealProcessor) (HANDLE, DWORD);

static volatile BOOL fThreadsInitialized = FALSE;
static BOOL fSystemIsNUMA = FALSE;
static ULONGLONG ullProcessorMask[256];
static ULONG ulNumaNodes;
static ULONG ulNumaNode = 0;

// Get NUMA-related information from Windows

#if defined (NUMA) //byrne

static void WinNumaInit(void)
{
  DWORD_PTR dwMask;
  HMODULE hModule;
  ULONG ulCPU, ulNode;
  ULONGLONG ullMask;
  DWORD dwCPU;

  if (!fThreadsInitialized) {
    Lock(ThreadsLock);
    if (!fThreadsInitialized) {
      printf("\nInitializing multiple threads.\n");
      fThreadsInitialized = TRUE;
      hModule = GetModuleHandle("kernel32");
      pGetNumaHighestNodeNumber =
          (void *) GetProcAddress(hModule,
"GetNumaHighestNodeNumber");
      pGetNumaNodeProcessorMask =
          (void *) GetProcAddress(hModule,
"GetNumaNodeProcessorMask");
      pSetThreadIdealProcessor =
          (void *) GetProcAddress(hModule, "SetThreadIdealProcessor");
      if (pGetNumaHighestNodeNumber && pGetNumaNodeProcessorMask &&
          pGetNumaHighestNodeNumber(&ulNumaNodes) && (ulNumaNodes >
0)) {
        fSystemIsNUMA = TRUE;
        if (ulNumaNodes > 255)
          ulNumaNodes = 255;
        printf("System is NUMA. %d nodes reported by Windows\n",
            ulNumaNodes + 1);
        for (ulNode = 0; ulNode <= ulNumaNodes; ulNode++) {
          pGetNumaNodeProcessorMask((UCHAR) ulNode,
&ullProcessorMask[ulNode]);
          printf("Node %d CPUs: ", ulNode);
          ullMask = ullProcessorMask[ulNode];
          if (0 == ullMask)
            fSystemIsNUMA = FALSE;
          else {
            ulCPU = 0;
            do {
              if (ullMask & 1)
                printf("%d ", ulCPU);
              ulCPU++;
              ullMask >>= 1;
            } while (ullMask);
          }
          printf("\n");
        }
// Thread 0 was already started on some CPU. To simplify things
further,
// exchange ullProcessorMask[0] and ullProcessorMask[node for that
CPU],
// so ullProcessorMask[0] would always be node for thread 0
        dwCPU =
            pSetThreadIdealProcessor(GetCurrentThread(),
MAXIMUM_PROCESSORS);
        printf("Current ideal CPU is %u\n", dwCPU);
        pSetThreadIdealProcessor(GetCurrentThread(), dwCPU);
        if ((((DWORD) - 1) != dwCPU) && (MAXIMUM_PROCESSORS != dwCPU)
&&
            !(ullProcessorMask[0] & (1u i64 << dwCPU))) {
          for (ulNode = 1; ulNode <= ulNumaNodes; ulNode++) {
            if (ullProcessorMask[ulNode] & (1u i64 << dwCPU)) {
              printf("Exchanging nodes 0 and %d\n", ulNode);
              ullMask = ullProcessorMask[ulNode];
              ullProcessorMask[ulNode] = ullProcessorMask[0];
              ullProcessorMask[0] = ullMask;
              break;
            }
          }
        }
      } else
        printf("System is SMP, not NUMA.\n");
    }
 Unlock(ThreadsLock);
  }
}
#endif  //byrne
// Start thread. For NUMA system set it affinity.

pthread_t NumaStartThread(void *func, void *args)
{
  HANDLE hThread;
  ULONGLONG ullMask;
//byrne
  /*WinNumaInit();
  if (fSystemIsNUMA) {
    ulNumaNode++;
    if (ulNumaNode > ulNumaNodes)
      ulNumaNode = 0;
    ullMask = ullProcessorMask[ulNumaNode];
    printf("Starting thread on node %d CPU mask %I64d\n", ulNumaNode,
ullMask);
    SetThreadAffinityMask(GetCurrentThread(), (DWORD_PTR) ullMask);
    hThread = (HANDLE) _beginthreadex(0, 0, func, args,
CREATE_SUSPENDED, 0);
    SetThreadAffinityMask(hThread, (DWORD_PTR) ullMask);
    ResumeThread(hThread);
    SetThreadAffinityMask(GetCurrentThread(), ullProcessorMask[0]);
  } else*/ //byrne
    hThread = (HANDLE) _beginthreadex(0, 0, func, args, 0, 0);
  return hThread;
}

// Allocate memory for thread #N

void *WinMalloc(size_t cbBytes, int iThread)
{
  HANDLE hThread;
  //DWORD_PTR dwAffinityMask;  byrne
  void *pBytes;
  ULONG ulNode;

  /*WinNumaInit(); byrne
  if (fSystemIsNUMA) {
    ulNode = iThread % (ulNumaNodes + 1);
    hThread = GetCurrentThread();
    dwAffinityMask = SetThreadAffinityMask(hThread,
ullProcessorMask[ulNode]);
    pBytes = VirtualAlloc(NULL, cbBytes, MEM_COMMIT, PAGE_READWRITE);
    memset(pBytes, 0, cbBytes);
    SetThreadAffinityMask(hThread, dwAffinityMask);
    return pBytes;
  } else*/ //byrne
 return malloc(cbBytes);
}

// Allocate interleaved memory

void *WinMallocInterleaved(size_t cbBytes, int cThreads)
{
  char *pBase;
  char *pEnd;
  char *pch;
  HANDLE hThread;
  //DWORD_PTR dwAffinityMask;byrne
  ULONG ulNode;
  SYSTEM_INFO sSysInfo;
  size_t dwStep;
  int iThread;
  DWORD dwPageSize;             // the page size on this computer
  LPVOID lpvResult;

  /*WinNumaInit();//byrne
  if (fSystemIsNUMA && (cThreads > 1)) {
    GetSystemInfo(&sSysInfo);   // populate the system information
structure
    dwPageSize = sSysInfo.dwPageSize;

// Reserve pages in the process's virtual address space.
    pBase = (char *) VirtualAlloc(NULL, cbBytes, MEM_RESERVE,
PAGE_NOACCESS);
    if (pBase == NULL) {
      printf("VirtualAlloc() reserve failed\n");
      exit(0);
    }
// Now walk through memory, committing each page
    hThread = GetCurrentThread();
    dwStep = dwPageSize * cThreads;
    pEnd = pBase + cbBytes;
    for (iThread = 0; iThread < cThreads; iThread++) {
      ulNode = iThread % (ulNumaNodes + 1);
      dwAffinityMask = SetThreadAffinityMask(hThread,
ullProcessorMask[ulNode]);
      for (pch = pBase + iThread * dwPageSize; pch < pEnd; pch +=
dwStep) {
        lpvResult = VirtualAlloc(pch,   // next page to commit
            dwPageSize, // page size, in bytes
            MEM_COMMIT, // allocate a committed page
            PAGE_READWRITE);    // read/write access
        if (lpvResult == NULL)
          ExitProcess(GetLastError());
        memset(lpvResult, 0, dwPageSize);
      }
      SetThreadAffinityMask(hThread, dwAffinityMask);
    }
  } else {*/ //byrne
    pBase = VirtualAlloc(NULL, cbBytes, MEM_COMMIT, PAGE_READWRITE);
    if (pBase == NULL)
      ExitProcess(GetLastError());
    memset(pBase, 0, cbBytes);
  //}byrne
  return (void *) pBase;
}

// Free interleaved memory

void WinFreeInterleaved(void *pMemory, size_t cBytes)
{
  VirtualFree(pMemory, 0, MEM_RELEASE);
}

#endif

/*
 *******************************************************************************
 *
        *
 *   Linux NUMA support
        *
 *
        *
 *******************************************************************************
 */

#if defined(LINUX) && defined(NUMA)

/*
 *******************************************************************************
 *
        *
 *   First, discover if we are on a NUMA box.  If not, the normal SMP
stuff is *
 *   primed and ready to go.  If we are on a NUMA machine, we need to
know (a) *
 *   how many processors (nodes in the case of AMD/Intel) we have on
the       *
 *   machine and (b) how many processors (threads) the user intends to
run.    *
 *   It becomes important for the "smpmt=n" command to be either on
the        *
 *   command-line or in the crafty.rc/.craftyrc files, otherwise we
might not  *
 *   get things initialized optimally in a NUMA environment.
        *
 *
        *
 *******************************************************************************
 */

void NumaInit(void)
{
  int numa_machine, maxNumaNodes;

  if (!fThreadsInitialized) {
    Lock(ThreadsLock);
    if (!fThreadsInitialized) {
      printf("\nInitializing multiple threads.\n");
      fThreadsInitialized = TRUE;
      numa_machine = numa_available();
      if (numa_machine >= 0) {
        maxNumaNodes = numa_max_node();
        printf("System is NUMA. %d nodes reported by Linux\n",
            maxNumaNodes + 1);
      } else {
        Print(4095, "system is not NUMA, skipping NUMA
initialization\n");
        return;
      }

      pGetNumaHighestNodeNumber =
          (void *) GetProcAddress(hModule,
"GetNumaHighestNodeNumber");
      pGetNumaNodeProcessorMask =
          (void *) GetProcAddress(hModule,
"GetNumaNodeProcessorMask");
      pSetThreadIdealProcessor =
          (void *) GetProcAddress(hModule, "SetThreadIdealProcessor");
      if (pGetNumaHighestNodeNumber && pGetNumaNodeProcessorMask &&
          pGetNumaHighestNodeNumber(&ulNumaNodes) && (ulNumaNodes >
0)) {
        fSystemIsNUMA = TRUE;
        if (ulNumaNodes > 255)
          ulNumaNodes = 255;
        printf("System is NUMA. %d nodes reported by Windows\n",
            ulNumaNodes + 1);
        for (ulNode = 0; ulNode <= ulNumaNodes; ulNode++) {
          pGetNumaNodeProcessorMask((UCHAR) ulNode,
&ullProcessorMask[ulNode]);
          printf("Node %d CPUs: ", ulNode);
          ullMask = ullProcessorMask[ulNode];
          if (0 == ullMask)
            fSystemIsNUMA = FALSE;
          else {
            ulCPU = 0;
            do {
              if (ullMask & 1)
                printf("%d ", ulCPU);
              ulCPU++;
              ullMask >>= 1;
            } while (ullMask);
          }
          printf("\n");
        }
// Thread 0 was already started on some CPU. To simplify things
further,
// exchange ullProcessorMask[0] and ullProcessorMask[node for that
CPU],
// so ullProcessorMask[0] would always be node for thread 0
        dwCPU =
            pSetThreadIdealProcessor(GetCurrentThread(),
MAXIMUM_PROCESSORS);
        printf("Current ideal CPU is %u\n", dwCPU);
        pSetThreadIdealProcessor(GetCurrentThread(), dwCPU);
        if ((((DWORD) - 1) != dwCPU) && (MAXIMUM_PROCESSORS != dwCPU)
&&
            !(ullProcessorMask[0] & (1u i64 << dwCPU))) {
          for (ulNode = 1; ulNode <= ulNumaNodes; ulNode++) {
            if (ullProcessorMask[ulNode] & (1u i64 << dwCPU)) {
              printf("Exchanging nodes 0 and %d\n", ulNode);
              ullMask = ullProcessorMask[ulNode];
              ullProcessorMask[ulNode] = ullProcessorMask[0];
              ullProcessorMask[0] = ullMask;
              break;
            }
          }
        }
      } else
        printf("System is SMP, not NUMA.\n");
    }
 Unlock(ThreadsLock);
  }
}

// Start thread. For NUMA system set it affinity.

pthread_t NumaStartThread(void *func, void *args)
{
  HANDLE hThread;
  ULONGLONG ullMask;

/*  WinNumaInit(); //byrne
  if (fSystemIsNUMA) {
    ulNumaNode++;
    if (ulNumaNode > ulNumaNodes)
      ulNumaNode = 0;
    ullMask = ullProcessorMask[ulNumaNode];
    printf("Starting thread on node %d CPU mask %I64d\n", ulNumaNode,
ullMask);
    SetThreadAffinityMask(GetCurrentThread(), (DWORD_PTR) ullMask);
    hThread = (HANDLE) _beginthreadex(0, 0, func, args,
CREATE_SUSPENDED, 0);
    SetThreadAffinityMask(hThread, (DWORD_PTR) ullMask);
    ResumeThread(hThread);
    SetThreadAffinityMask(GetCurrentThread(), ullProcessorMask[0]);
  } else */  //byrne
    hThread = (HANDLE) _beginthreadex(0, 0, func, args, 0, 0);
  return hThread;
}

// Allocate memory for thread #N

void *WinMalloc(size_t cbBytes, int iThread)
{
  HANDLE hThread;
  DWORD_PTR dwAffinityMask;
  void *pBytes;
  ULONG ulNode;

  /*WinNumaInit();//byrne
  if (fSystemIsNUMA) {
    ulNode = iThread % (ulNumaNodes + 1);
    hThread = GetCurrentThread();
    dwAffinityMask = SetThreadAffinityMask(hThread,
ullProcessorMask[ulNode]);
    pBytes = VirtualAlloc(NULL, cbBytes, MEM_COMMIT, PAGE_READWRITE);
    memset(pBytes, 0, cbBytes);
    SetThreadAffinityMask(hThread, dwAffinityMask);
    return pBytes;
  } else*///byrne
 return malloc(cbBytes);
}

// Allocate interleaved memory

void *WinMallocInterleaved(size_t cbBytes, int cThreads)
{
  char *pBase;
  char *pEnd;
  char *pch;
  HANDLE hThread;
  DWORD_PTR dwAffinityMask;
  ULONG ulNode;
  SYSTEM_INFO sSysInfo;
  size_t dwStep;
  int iThread;
  DWORD dwPageSize;             // the page size on this computer
  LPVOID lpvResult;

  WinNumaInit();
  if (fSystemIsNUMA && (cThreads > 1)) {
    GetSystemInfo(&sSysInfo);   // populate the system information
structure
    dwPageSize = sSysInfo.dwPageSize;

// Reserve pages in the process's virtual address space.
    pBase = (char *) VirtualAlloc(NULL, cbBytes, MEM_RESERVE,
PAGE_NOACCESS);
    if (pBase == NULL) {
      printf("VirtualAlloc() reserve failed\n");
      exit(0);
    }
// Now walk through memory, committing each page
    hThread = GetCurrentThread();
    dwStep = dwPageSize * cThreads;
    pEnd = pBase + cbBytes;
    for (iThread = 0; iThread < cThreads; iThread++) {
      ulNode = iThread % (ulNumaNodes + 1);
      dwAffinityMask = SetThreadAffinityMask(hThread,
ullProcessorMask[ulNode]);
      for (pch = pBase + iThread * dwPageSize; pch < pEnd; pch +=
dwStep) {
        lpvResult = VirtualAlloc(pch,   // next page to commit
            dwPageSize, // page size, in bytes
            MEM_COMMIT, // allocate a committed page
            PAGE_READWRITE);    // read/write access
        if (lpvResult == NULL)
          ExitProcess(GetLastError());
        memset(lpvResult, 0, dwPageSize);
      }
      SetThreadAffinityMask(hThread, dwAffinityMask);
    }
  } else {
    pBase = VirtualAlloc(NULL, cbBytes, MEM_COMMIT, PAGE_READWRITE);
    if (pBase == NULL)
      ExitProcess(GetLastError());
    memset(pBase, 0, cbBytes);
  }
 return (void *) pBase;
}

// Free interleaved memory

void WinFreeInterleaved(void *pMemory, size_t cBytes)
{
  VirtualFree(pMemory,  // base address of block
      cBytes,   // bytes of committed pages
      MEM_DECOMMIT | MEM_RELEASE);      // decommit the pages
}

#endif
==========================================================================
Re: here are the changes Peter Skinner 21:06:28 07/27/04
- Re: here are the changes Mike Byrne 05:31:24 07/28/04
This page took 0 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.