Computer Chess Club Archives


Search

Terms

Messages

Subject: Simple reentrant token parser

Author: Dann Corbit

Date: 09:09:14 10/10/05


#include <string.h>
#include <limits.h>
#include <stdlib.h>
#include <ctype.h>

/* The default delimiters are some ordinary white space characters. */
static const char default_delimiters[] = " \n\t\r\f";

/*
 * tokenize() is similar to strtok(), but it keeps its scan position in
 * caller-supplied storage instead of in hidden static state.
 *
 * Tokens are the substrings of 'string' separated by any of the characters
 * in 'delimiter_list'.  Leading delimiters are skipped, so runs of
 * delimiters never produce empty tokens.
 *
 * Parameters:
 *   string         - Buffer to start tokenizing, or NULL to continue from the
 *                    position saved in '*placeholder' by the previous call.
 *                    The buffer is modified: a null character is written
 *                    over the delimiter that ends each token.
 *   delimiter_list - Null-terminated set of delimiter characters.  It may
 *                    change from call to call.  NULL or an empty string
 *                    selects 'default_delimiters'.
 *   placeholder    - Address of a caller-owned 'char *' used to remember where
 *                    the next call should resume.  It is updated whenever
 *                    a scan takes place.
 *
 * Returns a pointer to the next token (inside the caller's buffer), or NULL
 * when no tokens remain, when there is nothing to resume from, or when
 * 'placeholder' itself is NULL.
 */
char *tokenize(char *string, const char *delimiter_list, char **placeholder)
{
    char *token;
    char *end;

    /* Without somewhere to store the scan position nothing can be done. */
    if (placeholder == NULL)
        return NULL;

    if (delimiter_list == NULL || delimiter_list[0] == '\0')
        delimiter_list = default_delimiters;

    if (string == NULL)
        string = *placeholder;

    if (string == NULL)
        return NULL;

    /* strspn() gives the length of the leading run of delimiters. */
    token = string + strspn(string, delimiter_list);
    if (*token == '\0') {
        /* Only delimiters (or nothing) were left: park at the terminator. */
        *placeholder = token;
        return NULL;
    }

    /* strcspn() gives the length of the leading run of non-delimiters. */
    end = token + strcspn(token, delimiter_list);
    if (*end != '\0')
        *end++ = '\0';          /* terminate the token, resume after it */
    *placeholder = end;

    return token;
}

#ifdef UNIT_TEST
/*
 * Sample inputs for the unit test.  They are writable arrays rather than
 * pointers to string literals because tokenize() writes null characters into
 * the buffer it is given.  Strings 4-7 repeat the text of strings 0-3.
 *
 * Each literal is split with adjacent-string concatenation so that no string
 * literal spans a physical source line (which would not compile).
 */
char            test_string0[] =
    "This is a test.  This is only a test.  If it "
    "were an actual emergency, you would be dead.";
char            test_string1[] =
    "This is a also a test.  This is only a test. "
    "If it were an actual emergency, you would be dead. 12345";
char            test_string2[] =
    "The quick brown fox jumped over the lazy dog's "
    "back 1234567890 times.";
char            test_string3[] =
    " \t\r\n\fThe quick brown fox jumped over the "
    "lazy dog's back 1234567890 times.";
char            test_string4[] =
    "This is a test.  This is only a test.  If it "
    "were an actual emergency, you would be dead.";
char            test_string5[] =
    "This is a also a test.  This is only a test. "
    "If it were an actual emergency, you would be dead. 12345";
char            test_string6[] =
    "The quick brown fox jumped over the lazy dog's "
    "back 1234567890 times.";
char            test_string7[] =
    " \t\r\n\fThe quick brown fox jumped over the "
    "lazy dog's back 1234567890 times.";

#include <stdio.h>

/*
 * Delimiter set filled in by init_whitespace().  UCHAR_MAX + 1 bytes is
 * enough for every non-null character value plus the terminating null.
 */
char            whitespace[UCHAR_MAX + 1];

/*
 * Build the delimiter set used by the test: every character value that is
 * either white space or punctuation according to isspace()/ispunct(), stored
 * in ascending order as a null-terminated string in 'whitespace'.
 */
void            init_whitespace(void)
{
    int             c;
    size_t          count = 0;

    /*
     * Start at 1: the null character can never be a member of a
     * null-terminated delimiter string.  The bound is inclusive so that
     * UCHAR_MAX itself is examined as well.
     */
    for (c = 1; c <= UCHAR_MAX; c++) {
        /* A single combined test stores each character at most once. */
        if (isspace(c) || ispunct(c)) {
            whitespace[count++] = (char) c;
        }
    }
    /* Terminate explicitly rather than relying on static zero-initialization. */
    whitespace[count] = '\0';
}

/*
 * Split 'test_string' into tokens using the delimiter set 'white' and print
 * each token on its own line with puts().  'white' is handed to tokenize()
 * unchanged on every call, so NULL is allowed.  'test_string' is modified by
 * tokenize().
 */
void            spin_test(char *test_string, char *white)
{
    char           *cursor = NULL;
    char           *word;

    /* First call passes the buffer; later calls pass NULL to resume. */
    for (word = tokenize(test_string, white, &cursor);
         word != NULL;
         word = tokenize(NULL, white, &cursor)) {
        puts(word);
    }
}
/*
 * Unit-test driver: tokenizes the first four sample strings with the
 * whitespace+punctuation delimiter set, then the last four with the default
 * delimiters (selected by passing NULL), printing every token.
 */
int             main(void)
{
    char           *const punct_inputs[] = {
        test_string0, test_string1, test_string2, test_string3
    };
    char           *const space_inputs[] = {
        test_string4, test_string5, test_string6, test_string7
    };
    size_t          k;

    init_whitespace();

    puts("Whitespace is whitespace+punctuation");
    for (k = 0; k < sizeof punct_inputs / sizeof punct_inputs[0]; k++) {
        spin_test(punct_inputs[k], whitespace);
    }

    puts("Whitespace is simple whitespace");
    for (k = 0; k < sizeof space_inputs / sizeof space_inputs[0]; k++) {
        spin_test(space_inputs[k], NULL);
    }

    return 0;
}
#endif



This page took 0 seconds to execute

Last modified: Thu, 15 Apr 21 08:11:13 -0700

Current Computer Chess Club Forums at Talkchess. This site by Sean Mintz.