Computer Chess Club Archives


Search

Terms

Messages

Subject: I made one...

Author: Dann Corbit

Date: 19:15:34 10/13/05

Go up one level in this thread


It processes about 60 MB of PGN per minute on my machine.

You can find a Win32 binary here:
http://cap.connx.com/chess-engines/new-approach/splitbyname.exe

For the curious, here is the source code:

#include <string.h>
#include <stdio.h>
#include <limits.h>
#include <stdlib.h>
#include <ctype.h>

/* Delimiters used when the caller supplies none: ordinary white space. */
static const char default_delimiters[] = " \n\t\r\f";

/*
 * tokenize() -- a reentrant tokenizer in the spirit of strtok().
 *
 * string          Buffer to start scanning, or NULL to continue the scan
 *                 recorded in *placeholder by the previous call.
 * delimiter_list  Characters that separate tokens.  It may differ from call
 *                 to call; NULL or "" selects 'default_delimiters'.
 * placeholder     Caller-owned cursor.  It is written on every call that
 *                 has a buffer to scan, and read only when 'string' is NULL.
 *
 * Returns a pointer to the next token, or NULL when no token is left (or when
 * both 'string' and *placeholder are NULL).
 *
 * The scanned buffer is modified: the delimiter that ends each token is
 * overwritten with a null character.
 */
char *tokenize(char *string, const char *delimiter_list, char **placeholder)
{
    char *start;
    char *stop;

    /* A missing or empty delimiter list means "use the defaults". */
    if (delimiter_list == NULL || *delimiter_list == '\0')
        delimiter_list = default_delimiters;

    /* NULL asks us to resume where the previous call stopped. */
    start = (string != NULL) ? string : *placeholder;
    if (start == NULL)
        return NULL;

    /* strspn() measures the leading run of delimiters; step over it. */
    start += strspn(start, delimiter_list);
    if (*start == '\0') {
        /* Only delimiters were left: park the cursor on the terminator. */
        *placeholder = start;
        return NULL;
    }

    /*
     * strcspn() measures the run of non-delimiters, so 'stop' lands either on
     * the delimiter that ends the token or on the string terminator.
     */
    stop = start + strcspn(start, delimiter_list);
    if (*stop == '\0') {
        /* Last token: the next call will see an empty string. */
        *placeholder = stop;
    } else {
        /* Cut the token off and resume just past the delimiter. */
        *stop = '\0';
        *placeholder = stop + 1;
    }
    return start;
}


/*
 * Delimiter set used when turning a player name into a file name.  Despite
 * the name it is not limited to white space: after init_whitespace() it holds
 * every non-zero character code that isalpha() rejects (blanks, digits,
 * punctuation, control characters, ...).  Sized so that even a locale with no
 * alphabetic characters at all leaves room for the terminator.
 */
char            whitespace[UCHAR_MAX + 1];

/*
 * Fill 'whitespace' with all character codes 1..UCHAR_MAX (inclusive) that
 * are not alphabetic, and null-terminate it so it can be passed to tokenize()
 * as a delimiter list.  Code 0 is skipped because it would end the list.
 * The result depends on the current locale's isalpha(); calling the function
 * again simply rebuilds the same list.
 */
void            init_whitespace(void)
{
    int             i;
    int             index = 0;
    /* '<=' so that the last code, UCHAR_MAX itself, is classified too. */
    for (i = 1; i <= UCHAR_MAX; i++) {
        if (!isalpha(i)) {
            whitespace[index++] = (char) i;
        }
    }
    /* Terminate explicitly rather than rely on static zero-initialisation. */
    whitespace[index] = 0;
}

/* Result buffer of collapse(): the file name built from the last name given. */
char            cat_name[32767];

/*
 * collapse() turns a player name into a file name and leaves it in 'cat_name'.
 *
 * The name is split with tokenize() using the 'whitespace' delimiter set, the
 * resulting tokens are joined with '_' and ".pgn" is appended, e.g.
 * "Kasparov, Garry" becomes "Kasparov_Garry.pgn" once init_whitespace() has
 * been called.
 *
 * "unknown.pgn" is produced when the name starts with '?' or '*', is empty,
 * or yields no token at all (nothing but delimiter characters), so that such
 * names never map to a bare ".pgn".
 *
 * test_string is modified, because tokenize() writes null characters into it.
 * Tokens that would not fit into 'cat_name' are dropped instead of overrunning
 * the buffer.
 */
void            collapse(char *test_string)
{
    /* Highest offset at which ".pgn" plus its terminator still fits. */
    const size_t    limit = sizeof cat_name - sizeof ".pgn";
    size_t          used = 0;
    char           *p = NULL;
    char           *token;

    /* A leading '?' or '*' marks the name as unknown: skip tokenizing. */
    if (*test_string != '?' && *test_string != '*') {
        token = tokenize(test_string, whitespace, &p);
        while (token) {
            size_t          length = strlen(token);

            /* +1 leaves room for the '_' separator. */
            if (used + length + 1 > limit)
                break;
            if (used > 0)
                cat_name[used++] = '_';
            memcpy(cat_name + used, token, length);
            used += length;
            token = tokenize(NULL, whitespace, &p);
        }
    }
    cat_name[used] = 0;

    /* Nothing usable in the name: fall back to the shared catch-all file. */
    if (used == 0)
        strcpy(cat_name, "unknown");
    strcat(cat_name, ".pgn");
}


/*
 * Position of the reader inside the PGN stream.  main() starts in in_header
 * and switches to in_body on the first line whose leading token does not
 * begin with '['; a later line that does begin with '[' marks the start of
 * the next game.
 */
typedef enum game_state {
    in_header, in_body
}               game_state;

static char     string[32767];  /* Current input line, exactly as fgets()
                                 * delivered it; 32K should hold any
                                 * reasonable PGN chess game line */
static char     pgn[32767];     /* Working copy of the current line that
                                 * tokenize() is allowed to cut up; same
                                 * size as 'string' */
static char     game[10000000]; /* Text of the game read so far, one line
                                 * appended at a time; 10 MB should hold any
                                 * reasonable single chess game */
static char     white[32767];   /* presumably a per-player buffer for the
                                 * White side -- TODO confirm its use */
static char     black[32767];   /* presumably a per-player buffer for the
                                 * Black side -- TODO confirm its use */
/* Leading tokens of the two tag lines that name the players; main() compares
 * them with the first token of a line, ignoring case. */
static const char *tagWhite = "[White";
static const char *tagBlack = "[Black";

int             main(void)
{
    FILE           *nextWhite;
    FILE           *nextBlack;
    char           *p;
    size_t          length_remaining;
    game_state      state = in_header;
    int             do_flush = 1;
    init_whitespace();
    while (fgets(string, sizeof string, stdin)) {
        char           *token;
        strcpy(pgn, string);
        token = tokenize(pgn, NULL, &p);
        if (token) {
            if (token[0] == '[') {
                if (state != in_header) {
                    fputs(game, nextWhite);
                    fclose(nextWhite);
                    fputs(game, nextBlack);
                    fclose(nextBlack);
                    state = in_header;
                    strcpy(game, "");
                    do_flush = 0;
                }
                if (_stricmp(token, tagWhite) == 0) {
                    token = tokenize(NULL, "]\"", &p);
                    if (token) {
                        collapse(token);
                        nextWhite = fopen(cat_name, "at");
                        if (nextWhite == NULL) {
                            printf("Fatal Error: Unable to open file %s\n",
cat_name);
                            perror("Unable to open for append mode\n");
                            exit(EXIT_FAILURE);
                        } else
                            do_flush = 1;
                    }
                }
                if (_stricmp(token, tagBlack) == 0) {
                    token = tokenize(NULL, "]\"", &p);
                    if (token) {
                        collapse(token);
                        nextBlack = fopen(cat_name, "at");
                        if (nextBlack == NULL) {
                            printf("Fatal Error: Unable to open file %s\n",
cat_name);
                            perror("Unable to open for append mode\n");
                            exit(EXIT_FAILURE);
                        }
                    }
                }
            } else {
                state = in_body;
            }
        }
        length_remaining = sizeof game - strlen(game);
        if (length_remaining > strlen(string)) {
            strncat(game, string, sizeof game - strlen(game));
        } else {
            puts("Fatal error: Game larger than 10 MB.  Exiting.");
            exit(EXIT_FAILURE);
        }
    }
    if (do_flush) {
        fputs(game, nextWhite);
        fclose(nextWhite);
        fputs(game, nextBlack);
        fclose(nextBlack);
    }
    return 0;
}



This page took 0.01 seconds to execute

Last modified: Thu, 15 Apr 21 08:11:13 -0700

Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.