Author: Dann Corbit
Date: 19:15:34 10/13/05
Go up one level in this thread
It processes about 60 MB of PGN per minute on my machine.
You can find a Win32 binary here:
http://cap.connx.com/chess-engines/new-approach/splitbyname.exe
For the curious, here is the source code:
#include <string.h>
#include <stdio.h>
#include <limits.h>
#include <stdlib.h>
#include <ctype.h>
/* Delimiters used when the caller supplies none: ordinary white space. */
static const char default_delimiters[] = " \n\t\r\f";

/*
 * tokenize() - split a string into tokens, in the manner of a reentrant
 * strtok().
 *
 * string          Buffer to start scanning, or NULL to continue from the
 *                 position saved in *placeholder by the previous call.
 * delimiter_list  Characters that separate tokens.  May differ from call to
 *                 call.  NULL or an empty string selects default_delimiters.
 * placeholder     Caller-owned cursor; must not be NULL.  It is updated on
 *                 every call that gets as far as scanning the buffer.
 *
 * Returns a pointer to the next token, or NULL when no token remains (or
 * when both 'string' and *placeholder are NULL).
 *
 * The buffer is modified: the delimiter that ends a token is overwritten
 * with a null character.
 */
char *tokenize(char *string, const char *delimiter_list, char **placeholder)
{
    const char *delims = delimiter_list;
    char *cursor = string;
    char *token;

    if (delims == NULL || delims[0] == '\0')
        delims = default_delimiters;

    if (cursor == NULL)
        cursor = *placeholder;
    if (cursor == NULL)
        return NULL;

    /* Skip the run of delimiters in front of the next token. */
    cursor += strspn(cursor, delims);
    if (*cursor == '\0') {
        *placeholder = cursor;
        return NULL;
    }

    /* The token extends up to the next delimiter or the end of the buffer. */
    token = cursor;
    cursor += strcspn(cursor, delims);
    if (*cursor != '\0')
        *cursor++ = '\0';   /* terminate the token and step past the delimiter */
    *placeholder = cursor;
    return token;
}
/*
 * Delimiter set used to break player names into words.  Sized for every
 * non-zero character value plus the terminating null character.
 */
char whitespace[UCHAR_MAX + 1];

/*
 * init_whitespace() - fill 'whitespace' with every non-zero character value
 * for which isalpha() is false.
 *
 * This is wider than white space and punctuation: digits, control
 * characters and any other non-letter are separators as well.  The result
 * depends on the locale in effect when the function is called.
 */
void init_whitespace(void)
{
    size_t count = 0;
    int c;

    /* Inclusive upper bound, so that the highest character value is covered. */
    for (c = 1; c <= UCHAR_MAX; c++) {
        if (!isalpha(c))
            whitespace[count++] = (char) c;
    }
    /* Terminate explicitly rather than relying on static zero-initialisation. */
    whitespace[count] = '\0';
}
/* Output file name produced by the most recent call to collapse(). */
char cat_name[32767];

/*
 * collapse() - turn a player name into a file name, stored in cat_name.
 *
 * The name is split with tokenize() using the 'whitespace' delimiter set;
 * the resulting words are joined with '_' and ".pgn" is appended, so that
 * "Kasparov, Garry" becomes "Kasparov_Garry.pgn" once init_whitespace() has
 * been called.
 *
 * "unknown.pgn" is used when the name is NULL, is empty, starts with '?' or
 * '*', or contains no word at all (which would otherwise yield the bare
 * file name ".pgn").
 *
 * test_string is modified by the tokenizer.  A name too long for cat_name
 * is truncated instead of overrunning the buffer.
 */
void collapse(char *test_string)
{
    static const char unknown[] = "unknown.pgn";
    static const char suffix[] = ".pgn";
    /* Space available for the stem, keeping room for the suffix and its NUL. */
    const size_t capacity = sizeof cat_name - sizeof suffix;
    size_t used = 0;
    char *save = NULL;
    char *token;

    if (test_string == NULL || *test_string == '\0' ||
        *test_string == '?' || *test_string == '*') {
        strcpy(cat_name, unknown);
        return;
    }

    for (token = tokenize(test_string, whitespace, &save);
         token != NULL;
         token = tokenize(NULL, whitespace, &save)) {
        size_t len = strlen(token);

        if (used > 0) {
            /* Need room for the separator and at least one more character. */
            if (capacity - used < 2)
                break;
            cat_name[used++] = '_';
        }
        if (len > capacity - used)
            len = capacity - used;
        /* Append at the tracked offset; no rescanning of cat_name. */
        memcpy(cat_name + used, token, len);
        used += len;
    }

    if (used == 0) {
        /* Nothing but separators: there is no usable stem. */
        strcpy(cat_name, unknown);
        return;
    }
    memcpy(cat_name + used, suffix, sizeof suffix);   /* copies the NUL too */
}
/* Which part of a PGN game the reader is currently in. */
typedef enum game_state {
    in_header,
    in_body
} game_state;

/* Buffer capacities, in bytes. */
enum {
    pgn_line_capacity = 32767,      /* 32K should hold any reasonable PGN line */
    pgn_game_capacity = 10000000    /* 10 MB should hold any reasonable game */
};

/* Most recent line read from the input, kept intact. */
static char string[pgn_line_capacity];

/* Working copy of the current line; this is the copy that gets tokenized. */
static char pgn[pgn_line_capacity];

/* Text of the game being accumulated, one line after another. */
static char game[pgn_game_capacity];

/* Name-sized buffers; not referenced by the code visible in this file. */
static char white[pgn_line_capacity];
static char black[pgn_line_capacity];

/* First token of the header lines that carry the player names. */
static const char *tagWhite = "[White";
static const char *tagBlack = "[Black";
/*
 * Case-insensitive string equality.  Returns non-zero when the two strings
 * match.  Portable stand-in for the compiler-specific _stricmp().
 */
static int same_tag(const char *lhs, const char *rhs)
{
    while (*lhs != '\0' && *rhs != '\0') {
        if (tolower((unsigned char) *lhs) != tolower((unsigned char) *rhs))
            return 0;
        lhs++;
        rhs++;
    }
    return *lhs == *rhs;
}

/*
 * Build the file name for 'player' with collapse() and open that file for
 * appending.  A NULL player (tag with no name) is passed on as an empty
 * name.  Terminates the program if the file cannot be opened.
 */
static FILE *open_player_file(char *player)
{
    char no_name[1] = "";
    FILE *fp;

    collapse(player != NULL ? player : no_name);
    /* Plain "a": text mode is the default, and "at" is not a standard mode. */
    fp = fopen(cat_name, "a");
    if (fp == NULL) {
        /* Report errno first, before another library call can change it. */
        perror("Unable to open for append mode");
        printf("Fatal Error: Unable to open file %s\n", cat_name);
        exit(EXIT_FAILURE);
    }
    return fp;
}

/*
 * Append 'text' to *fp and close it, leaving *fp NULL.  Does nothing when
 * *fp is already NULL.  Terminates the program on a write or close error.
 */
static void write_and_close(const char *text, FILE **fp)
{
    int failed;

    if (*fp == NULL)
        return;
    failed = (fputs(text, *fp) == EOF);
    if (fclose(*fp) == EOF)
        failed = 1;
    *fp = NULL;
    if (failed) {
        perror("Unable to write game");
        exit(EXIT_FAILURE);
    }
}

/*
 * Read PGN text from standard input and append each game to two files, one
 * named after the White player and one after the Black player.
 *
 * A line whose first token starts with '[' is a header line; any other
 * non-blank line is move text.  The first header line seen after move text
 * marks the start of the next game, at which point the accumulated game is
 * written out.  Text that precedes the first header has no destination and
 * is discarded at that point.
 *
 * Returns 0 on success; exits with EXIT_FAILURE on an I/O error or when a
 * single game does not fit in the game buffer.
 */
int main(void)
{
    FILE *nextWhite = NULL;     /* NULL means "no file open for this game" */
    FILE *nextBlack = NULL;
    char *p = NULL;
    size_t game_length = 0;     /* strlen(game), tracked to avoid rescanning */
    game_state state = in_header;

    init_whitespace();
    game[0] = '\0';

    while (fgets(string, sizeof string, stdin)) {
        size_t line_length = strlen(string);
        char *token;

        /* Tokenize a copy; the original line is appended to the game below. */
        strcpy(pgn, string);
        token = tokenize(pgn, NULL, &p);
        if (token != NULL) {
            if (token[0] == '[') {
                if (state != in_header) {
                    /* First header after move text: the previous game is done. */
                    write_and_close(game, &nextWhite);
                    write_and_close(game, &nextBlack);
                    state = in_header;
                    game[0] = '\0';
                    game_length = 0;
                }
                /*
                 * The tag is tested before the name is extracted, so the
                 * name is never mistaken for a tag on the second test.
                 */
                if (same_tag(token, tagWhite)) {
                    if (nextWhite != NULL)
                        fclose(nextWhite);  /* repeated tag: last one wins */
                    nextWhite = open_player_file(tokenize(NULL, "]\"", &p));
                } else if (same_tag(token, tagBlack)) {
                    if (nextBlack != NULL)
                        fclose(nextBlack);
                    nextBlack = open_player_file(tokenize(NULL, "]\"", &p));
                }
            } else {
                state = in_body;
            }
        }

        /* The line and its terminating NUL must both fit in what is left. */
        if (line_length >= sizeof game - game_length) {
            puts("Fatal error: Game larger than 10 MB. Exiting.");
            exit(EXIT_FAILURE);
        }
        memcpy(game + game_length, string, line_length + 1);
        game_length += line_length;
    }

    /* Write out the final game, if any file is open for it. */
    write_and_close(game, &nextWhite);
    write_and_close(game, &nextBlack);
    return 0;
}
This page took 0.01 seconds to execute
Last modified: Thu, 15 Apr 21 08:11:13 -0700
Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.