Demo

Tokenizer

Copyright

Copyright 2020-2022 Daniel Robert Bradley
This code is distributed under the terms of the LGPL v2.1
See: https://www.gnu.org/licenses/old-licenses/lgpl-2.1.en.html

Example Usage

#include <stdlib.h>
#include <stdio.h>

#include "libtokenizer/Runtime.h"
#include "libtokenizer/Tokenizer.h"
#include "libtokenizer/Term.h"

void printToken( Token* self, void* stream );

/*
 *  Example driver: tokenizes ./test/Sample.txt, printing each token to
 *  stdout with terminal colouring, then reports any outstanding Runtime
 *  allocations as a memory leak.
 */
int main( int argc, char** argv )
{
    const char* filepath = "./test/Sample.txt";

    PushbackReader* reader    = PushbackReader_new( filepath );
    Tokenizer*      tokenizer = Tokenizer_new( &reader );   //  Takes ownership; nulls 'reader'.

    while ( Tokenizer_hasMoreTokens( tokenizer ) )
    {
        Token* token = Tokenizer_nextToken( tokenizer );

        printToken( token, stdout );

        Token_free( token );   //  Caller owns each returned token.
    }

    Tokenizer_free( &tokenizer );

    if ( Runtime_Allocated() )
    {
        fprintf( stderr, "Memory leak: %i\n", Runtime_Allocated() );
    }

    return 0;
}

/*
 *  Writes the token's content to 'stream', preceded by a terminal colour
 *  escape chosen from the token's group (and, for bracket/symbolic and
 *  alphanumeric tokens, its more specific type), then resets the colour.
 */
void printToken( Token* self, void* stream )
{
    switch ( self->group->groupType )
    {
    case OPEN:
    case CLOSE:
    case SYMBOLIC:
        //  Comments are symbolic tokens (see Token_DetermineSymbolicType,
        //  which returns LINECOMMENT/COMMENT for '/' sequences).
        switch( self->type )
        {
        case COMMENT:
        case LINECOMMENT:
            Term_Colour( stream, COLOR_COMMENT );
            break;

        default:
            Term_Colour( stream, COLOR_BOLD );
        }
        break;

    case STRING:
        Term_Colour( stream, COLOR_STRING );
        break;

    case CHAR:
        Term_Colour( stream, COLOR_CHAR );
        break;

    case ALPHANUMERIC:
        switch ( self->type )
        {
        case PRIMITIVE:
            Term_Colour( stream, COLOR_TYPE );
            break;

        case CLASS:
        case KEYWORD:
        case MODIFIER:
            Term_Colour( stream, COLOR_MODIFIER );
            break;

        case WORD:
            Term_Colour( stream, COLOR_NORMAL );
            break;

        default:
            Term_Colour( stream, COLOR_LIGHT );
        }
        break;

    case VALUE:
        Term_Colour( stream, COLOR_VALUE );
        break;

    case UNKNOWN_GROUP:
        Term_Colour( stream, COLOR_UNKNOWN );
        break;

    default:
        Term_Colour( stream, COLOR_NORMAL );
    }
    fprintf( stream, "%s", self->content );
    Term_Colour( stream, COLOR_NORMAL );   //  Always restore the default colour.
}
import { Tokenizer, PushbackReader } from '/resources/lib/js/libtokenizer.js';

/*
 *  Re-tokenizes the textarea content on every input event and renders
 *  each non-blank token as its own <span> inside the output <pre>.
 */
document.getElementById( "textarea-input" ).oninput
=
function( event )
{
    var textarea  = event.target;
    var reader    = new PushbackReader( textarea.value );
    var tokenizer = new Tokenizer( reader );
    var output    = document.getElementById( "pre-output" );
        output.innerHTML = "";

    while ( tokenizer.hasMoreTokens() )
    {
        var token = tokenizer.nextToken();

        if ( "" != token.trim() )
        {
            var span = document.createElement( "SPAN" );

            //  Bug fix: use textContent rather than innerHTML so token
            //  text such as '<', '>' or '&' is shown literally instead
            //  of being parsed as markup (and cannot inject elements).
            span.textContent = token;

            output.appendChild( span );
        }
    }

    return false;
}
const fs           = require( 'fs' );
const libtokenizer = require( '/Users/daniel/Documents/Dropbox/Dropspace-Sites/_CA/com.libtokenizer/_gen/lib/js/libtokenizer' );

/*
 *  Reads the sample file asynchronously; the content is delivered to
 *  mainHandler, which logs it and then logs each token in turn.
 */
function main()
{
    fs.readFile( './test/Sample.txt', "ascii", mainHandler );
}

function mainHandler( error, content )
{
    if ( error )
    {
        console.error( error );
        return;
    }

    console.log( content );

    var reader    = new libtokenizer.PushbackReader( content );
    var tokenizer = new libtokenizer.Tokenizer     ( reader  );

    while( tokenizer.hasMoreTokens() )
    {
        //  Bug fix: 'token' was previously assigned without a
        //  declaration, creating an accidental global.
        var token = tokenizer.nextToken();

        console.log( token );
    }
}

main();

Class Definitions

public class
{
    @reader : PushbackReader*   # Character source; ownership taken by the constructor.
    @queue  : Queue<Token*>     # One-token look-ahead queue.

    #   NOTE(review): 'new' is declared to return PushbackReader* — looks
    #   like it should be Tokenizer* (cf. the C Tokenizer_new); confirm.
    public method new( reader: PushbackReader* ) : PushbackReader*
    public method nextToken()                    : Token*
    public method hasMoreTokens()                : boolean
}
#ifndef LIBTOKENIZER_TOKENIZER_H
#define LIBTOKENIZER_TOKENIZER_H

#include "libtokenizer/Base.h"
#include "libtokenizer/PushbackReader.h"
#include "libtokenizer/Queue.h"
#include "libtokenizer/Token.h"
#include "libtokenizer/TokenGroup.h"

//  Produces Token objects from a PushbackReader via a one-token
//  look-ahead queue.
typedef struct _Tokenizer
{
    PushbackReader* reader;   //  Owned: ownership is taken in Tokenizer_new.
    Queue*          queue;    //  Look-ahead queue of pending Token*.

} Tokenizer;

//  Takes ownership of *reader and sets the caller's pointer to NULL.
Tokenizer* Tokenizer_new          ( PushbackReader** reader );
//  Frees queued tokens, reader, and queue; nulls *self and returns NULL.
Tokenizer* Tokenizer_free         ( Tokenizer**      self   );
//  Returns the next token (caller frees), or NULL at end of input.
Token*     Tokenizer_nextToken    ( Tokenizer*       self   );
//  True while at least one token is queued.
bool       Tokenizer_hasMoreTokens( Tokenizer*       self   ); 

#endif
/*
 *  Creates a Tokenizer reading from the given PushbackReader and primes
 *  the internal queue so the first token, if any, is ready.
 */
export function Tokenizer( reader )
{
    this.reader = reader;
    this.queue  = new Queue();

    this.primeQueue();
}
import java.io.*;

public class Tokenizer {

InputStream input = null;
Token       token = null;

Constructor

#   Stores the reader and primes the queue so the first token, if any,
#   is immediately available.
public new( reader : PushbackReader )
{
    @reader = reader
    @queue  = new Queue<Token>()

    primeQueue();
}
/*
 *  Allocates a Tokenizer that takes ownership of *reader — the caller's
 *  pointer is set to NULL — and primes the token queue so the first
 *  token, if any, is immediately available.  Returns NULL if allocation
 *  fails (leaving *reader untouched).
 */
Tokenizer* Tokenizer_new( PushbackReader** reader )
{
    Tokenizer* self = Runtime_Calloc( 1, sizeof( Tokenizer ) );

    if ( self )
    {   
        //  Transfer ownership of the reader from the caller.
        self->reader = *reader; (*reader) = NULL;
        self->queue  = Queue_new();

        primeQueue( self );
    }
    return self;
}
/*
 *  Creates a Tokenizer reading from the given PushbackReader and primes
 *  the internal queue so the first token, if any, is ready.
 */
export function Tokenizer( reader )
{
    this.reader = reader;
    this.queue  = new Queue();

    this.primeQueue();
}

Destructor

The Ix method isn't technically required, but is provided here as an example. Typically, an Ix destructor would be used if resources need to be freed.

#   Drains the queue, freeing any tokens not yet handed out.
public delete()
{
    while ( (var tmp: Token* = @queue.removeHead()) )
    {
        delete tmp
    }
}
/*
 *  Frees the tokenizer and everything it owns: any tokens still queued,
 *  the reader (ownership was taken in Tokenizer_new), and the queue
 *  itself.  Nulls the caller's pointer and returns it (always NULL) for
 *  'self = Tokenizer_free( &self )' style use.
 *
 *  Cleanup: removed the meaningless 'if ( 1 )' wrapper around the
 *  queue-draining loop; behavior is unchanged.
 */
Tokenizer* Tokenizer_free( Tokenizer** self )
{
    if ( *self )
    {
        Token* tmp;

        //  Drain and free any tokens still queued.
        while ( (tmp = Queue_removeHead( (*self)->queue )) )
        {
            Token_free( tmp );
        }

        (*self)->reader = PushbackReader_free( &(*self)->reader );

        if ( (*self)->queue ) (*self)->queue = Queue_free( (*self)->queue );

        *self = Runtime_Free( (*self) );
    }
    return *self;
}

Tokenizer.nextToken

'nextToken' returns the next available token; otherwise null/undefined.

In reality, 'primeQueue' is called to supply the token queue with another token, then the head token of the queue is returned, if available.

#   Returns the next available token, or null at end of input.
#   Bug fix: the queue is drained with removeHead() — the method used
#   in delete() and in the C/JS versions; it previously called
#   removeFirst(), which is not used anywhere else.
public nextToken()
{
    primeQueue();

    if ( @queue.length > 0 )
    {
        return @queue.removeHead()
    }
    else
    {
        return null;
    }
}
/*
 *  Returns the next token, or NULL when the input is exhausted.
 *  primeQueue() tops the queue up first so a token is available
 *  whenever the reader still has content.  The caller owns (and must
 *  free) the returned token.
 */
Token* Tokenizer_nextToken( Tokenizer* self )
{
    Token* head = NULL;

    primeQueue( self );

    if ( Queue_getLength( self->queue ) > 0 )
    {
        head = (Token*) Queue_removeHead( self->queue );
    }
    return head;
}
/*
 *  Returns the next token, or undefined when no more input is
 *  available.  The queue is topped up first via primeQueue().
 */
Tokenizer.prototype.nextToken
=
function()
{
    this.primeQueue();

    var hasNext = (this.queue.getLength() > 0);

    return hasNext ? this.queue.removeHead() : undefined;
}

Tokenizer.hasMoreTokens

Returns true if there are more tokens available; otherwise false.

Because both the constructor and 'nextToken' call 'primeQueue' to ensure that the 'queue' always holds a token when one is available, an empty queue means that no more tokens are left.

#   True while the look-ahead queue holds at least one token.
public hasMoreTokens() : boolean
{
    return (@queue.getLength() > 0);
}
//  True while at least one token is queued; the queue is kept primed by
//  the constructor and nextToken, so empty means end of input.
bool Tokenizer_hasMoreTokens( Tokenizer* self )
{
    return (Queue_getLength( self->queue ) > 0);
}
//  True while at least one token is queued; empty means end of input.
Tokenizer.prototype.hasMoreTokens
=
function()
{
    return (this.queue.getLength() > 0);
}

Tokenizer.primeQueue

The 'primeQueue' method simply calls 'next' to retrieve the next token, then adds it to the end of the queue if one is able to be retrieved.

#   Fetches the next token from the reader (if any) and appends it to
#   the tail of the queue.
private primeQueue()
{
    if ( var token = next() )
    {
        @queue.addTail( token )
    }
}
/*
 *  Reads one token from the reader (via next) and, if one was
 *  available, appends it to the tail of the queue.
 */
static void primeQueue( Tokenizer* self )
{
    Token* pending = next( self );

    if ( pending )
    {
        Queue_addTail( self->queue, pending );
    }
}
/*
 *  Reads one token (via next) and, if one was available, appends it to
 *  the tail of the queue.
 */
Tokenizer.prototype.primeQueue
=
function()
{
    var pending = this.next();

    if ( pending )
    {
        this.queue.addTail( pending );
    }
}

Tokenizer.next

The 'next' method reads characters from the PushbackReader - 'reader' - and appends them to an initially empty string buffer. The method determines the token group using the first character, and then calls the TokenGroup.matches method for each additional character, which decides when the character can be appended to the previous characters to form a token.

The method must also handle the following special cases:

  1. When the first character indicates an escape code (\).
  2. When the first character indicates the start of a string (").
  3. When the first character indicates the start of a character constant (').

Usually, the last action the 'next' method performs is to decide that the most recently read character does not belong in the current token, and to push it back into the reader. For strings and character constants, however, the method reads the terminating (") or (') and then exits the loop.

#   Reads one token from the reader, or returns null at end of input.
#   The first character fixes the token's group; further characters are
#   appended while group.matches accepts them.  ESCAPE tokens take
#   exactly one following character; STRING and CHAR tokens also consume
#   their terminating quote.
#
#   Bug fix: three references used an undeclared variable 'c2' where
#   'ch2' was intended (string/char terminator reads and the final
#   pushback test); all now use 'ch2'.
private next() : string
{
    var token: Token*

    if ( var ch = @reader.read() )
    {
        var sb    = new StringBuffer()
        var group = new TokenGroup( ch )

        sb.append( ch )

        while ( var ch2 = @reader.read() )
        {
            if ( EnumGroupType.ESCAPE == group.groupType )
            {
                sb  = sb.append( ch2 )
                ch2 = @reader.read()
                break;
            }
            else
            if ( group.matches( ch2 ) )
            {
                if ( '\\' == ch2 )
                {
                    sb  = sb.append( ch2 )
                    ch2 = @reader.read()
                    sb  = sb.append( ch2 )
                }
                else
                {
                    sb = sb.append( ch2 )
                }
            }
            else
            if ( EnumGroupType.STRING == group.groupType )
            {
                sb  = sb.append( ch2 )
                ch2 = @reader.read()
                break
            }
            else
            if ( EnumGroupType.CHAR == group.groupType )
            {
                sb  = sb.append( ch2 )
                ch2 = @reader.read()
                break
            }
            else
            {
                break
            }
        }

        #   ch2 holds the first character NOT consumed into this token.
        if ( ch2 )
        {
            @reader.pushback()
        }

        if ( !sb.isEmpty()  )
        {
            token = new Token( this, sb.getContent(), group );
        }
    }
    return token;
}
/*
 *  Reads one token from the reader, or returns NULL at end of input.
 *
 *  The first character fixes the token's group; further characters are
 *  appended while TokenGroup_matches accepts them.  Special cases:
 *  ESCAPE tokens take exactly one following character; STRING and CHAR
 *  tokens also consume their terminating quote; a backslash inside a
 *  matching run appends the backslash plus the character after it.
 *
 *  On exit ch2 holds the first character NOT consumed into the token
 *  (0 at end of input) and is pushed back for the next call.
 *
 *  Token_new copies both the buffer content and the group, so both are
 *  freed here before returning.
 */
static Token* next( Tokenizer* self )
{
    Token* token = NULL;
    int    ch    = 0;
    int    ch2   = 0;

    if ( (ch = PushbackReader_read( self->reader )) )
    {
        StringBuffer*  sb = StringBuffer_new();
        TokenGroup* group = TokenGroup_new( ch );

        sb = StringBuffer_append_char( sb, ch );

        while ( (ch2 = PushbackReader_read( self->reader )) )
        {
            if ( ESCAPE == group->groupType )
            {
                //  An escape token is the backslash plus one character.
                sb  = StringBuffer_append_char( sb, ch2 );
                ch2 = PushbackReader_read( self->reader );
                break;
            }
            else
            if ( TokenGroup_matches( group, ch2 ) )
            {
                if ( '\\' == ch2 )
                {
                    //  Inside strings/chars, keep escape pairs together.
                    //  NOTE(review): if EOF immediately follows the
                    //  backslash, a 0 character is appended — confirm
                    //  StringBuffer_append_char tolerates this.
                    sb  = StringBuffer_append_char( sb, ch2 );
                    ch2 = PushbackReader_read( self->reader );
                    sb  = StringBuffer_append_char( sb, ch2 );
                }
                else
                {
                    sb  = StringBuffer_append_char( sb, ch2 );
                }
            }
            else
            if ( STRING == group->groupType )
            {
                //  Consume the closing double quote, then stop.
                sb = StringBuffer_append_char( sb, ch2 );
                ch2 = PushbackReader_read( self->reader );
                break;
            }
            else
            if ( CHAR == group->groupType )
            {
                //  Consume the closing single quote, then stop.
                sb = StringBuffer_append_char( sb, ch2 );
                ch2 = PushbackReader_read( self->reader );
                break;
            }
            else
            {
                break;
            }
        }

        //  Return the first unconsumed character, if any, to the reader.
        if ( ch2 )
        {
            PushbackReader_pushback( self->reader );
        }

        if ( !StringBuffer_isEmpty( sb ) )
        {
            token = Token_new( self, sb->content, group );
        }

        //  Token_new copied both; release the working copies.
        StringBuffer_free( sb );
        TokenGroup_free( group );
    }
    return token;
}
/*
 *  Reads one token string from the reader; returns undefined at end of
 *  input.  The first character fixes the token group; characters are
 *  appended while group.matches accepts them, and STRING/CHAR tokens
 *  also consume their closing quote (read and discarded here).
 *
 *  NOTE(review): unlike the C version, pushback() is called even when
 *  the loop ended because read() returned end-of-input — confirm that
 *  PushbackReader.pushback is harmless at EOF.
 *  NOTE(review): there is no ESCAPE-group or backslash handling here,
 *  unlike the C and Ix versions — confirm this is intentional.
 */
Tokenizer.prototype.next
=
function()
{
    var token = "";
    var ch;
    var ch2;

    if ( (ch = this.reader.read()) )
    {
        var group = new TokenGroup( ch );

        token = token + ch;

        while ( (ch2 = this.reader.read()) )
        {
            if ( group.matches( ch2 ) )
            {
                token = token + ch2;
            }
            else
            if ( "STRING" == group.groupType )
            {
                token = token + ch2;
                this.reader.read();
                break;
            }
            else
            if ( "CHAR" == group.groupType )
            {
                token = token + ch2;
                this.reader.read();
                break;
            }
            else
            {
                break;
            }
        }
        this.reader.pushback();
    }
    return ("" == token) ? undefined : token;
}

Token

Example

#include <stdio.h>
#include "libtokenizer/Token.h"
#include "libtokenizer/TokenGroup.h"

int main( int argc, char** argv )
{
    Token* token = Token_new( 0, "String", TokenGroup_new( 'A' ) );

    fprintf( stdout, "%s\n", Token_getContent   ( token ) );
    //fprintf( stdout, "%i\n", Token_getTokenGroup( token ) );
    fprintf( stdout, "%i\n", Token_getTokenType ( token ) );

    Token_free( token );
}

Class Definitions

public class
{
    @t       : Tokenizer&;      # Back-reference to the owning tokenizer.
    @content : string*;         # The token's text.
    @length  : integer;         # Length of @content.
    @group   : TokenGroup;      # Character group of the first character.
    @type    : EnumTokenType;   # Refined classification.
}
#ifndef LIBTOKENIZER_TOKEN_H
#define LIBTOKENIZER_TOKEN_H

#include "TokenGroup.h"
#include "EnumTokenType.h"

//  Forward declaration: Token only stores a back-pointer to its Tokenizer.
typedef struct _Tokenizer Tokenizer;

//  A single token: its text, length, character group, and refined type.
typedef struct _Token
{
    Tokenizer*    t;         //  Borrowed back-pointer; not freed by Token_free.
    char*         content;   //  Owned copy of the token text.
    int           length;    //  Length of content.
    TokenGroup*   group;     //  Owned copy of the token's group.
    EnumTokenType type;      //  Classification derived from group and content.

} Token;

//  Token_new copies both 'content' and 'aGroup'; the caller retains
//  ownership of the arguments it passed in.
Token*        Token_new                      ( Tokenizer* t, const char* content, TokenGroup* aGroup );
Token*        Token_free                     ( Token* this );
const char*   Token_getContent               ( Token* this );
int           Token_getLength                ( Token* this );   //  Fix: defined in Token.c but previously undeclared here.
TokenGroup*   Token_getTokenGroup            ( Token* this );
EnumTokenType Token_getTokenType             ( Token* this );
void          Token_print                    ( Token* this, void* stream );

#endif
/*
 *  Creates a Token.  Note: the 'length' parameter is accepted for
 *  signature compatibility but ignored — the stored length is always
 *  content.length.
 */
function Token( t, content, length, aGroup )
{
    this.t       = t;
    this.content = content;
    this.length  = content.length;
    this.group   = aGroup;
    this.type    = Token.DetermineTokenType( aGroup, content );
}
public class Token {

Tokenizer     t;
String        content;
int           length;
TokenGroup    group;
EnumTokenType type;

Constructor

#   Clones the content string; the group reference is stored as given.
public new( t: Tokenizer&, content: string&, aGroup: TokenGroup )
{
    @t       = t;
    @content = content.clone();
    @length  = content.length;
    @group   = aGroup;
    @type    = DetermineTokenType( aGroup, @content );
}
#include <stdlib.h>
#include <stdio.h>
#include "libtokenizer/Runtime.h"
#include "libtokenizer/String.h"
#include "libtokenizer/Term.h"
#include "libtokenizer/Token.h"

//  Private helpers (defined below) that refine a token's EnumTokenType
//  from its character group and text.
EnumTokenType   Token_DetermineTokenType       ( TokenGroup* group, const char* content );
EnumTokenType   Token_DetermineWhitespaceType  ( const char* content );
EnumTokenType   Token_DetermineSymbolicType    ( const char* content );
EnumTokenType   Token_DetermineAlphanumericType( const char* content );
EnumTokenType   Token_DetermineOpenType        ( const char* content );
EnumTokenType   Token_DetermineCloseType       ( const char* content );

/*
 *  Allocates a Token that copies both 'content' (via StringCopy) and
 *  'aGroup' (via TokenGroup_copy); the caller keeps ownership of the
 *  arguments it passed in.  Returns NULL if allocation fails.
 */
Token* Token_new( Tokenizer* t, const char* content, TokenGroup* aGroup )
{
    Token* self = Runtime_Calloc( 1, sizeof(Token) );

    if ( self )
    {
        self->t       = t;   //  Borrowed back-pointer; never freed by Token.
        self->content = StringCopy  ( content );
        self->length  = StringLength( content );
        self->group   = TokenGroup_copy( aGroup );
        self->type    = Token_DetermineTokenType( aGroup, content );
    }
    return self;
}
/*
 *  Creates a Token.  Note: the 'length' parameter is accepted for
 *  signature compatibility but ignored — the stored length is always
 *  content.length.
 */
function Token( t, content, length, aGroup )
{
    this.t       = t;
    this.content = content;
    this.length  = content.length;
    this.group   = aGroup;
    this.type    = Token.DetermineTokenType( aGroup, content );
}
/*
 *  Creates a Token; unlike the C version, the content string and group
 *  are stored by reference rather than copied.
 */
public Token( Tokenizer t, String content, TokenGroup aGroup )
{
    this.t       = t;
    this.content = content;
    this.length  = content.length();
    this.group   = aGroup;
    this.type    = DetermineTokenType( aGroup, content );
}

Destructor

Ix does not require an explicit destructor.

/*
 *  Frees the token's owned group and content; the Tokenizer back-pointer
 *  is merely cleared, never freed.  Always returns NULL for
 *  'token = Token_free( token )' style use.
 *
 *  NOTE(review): 'content' is released with plain free() while the Token
 *  itself goes through Runtime_Free; if StringCopy allocates through the
 *  Runtime counters this skews Runtime_Allocated() — confirm.
 */
Token* Token_free( Token* self )
{
    if ( self->group ) self->group = TokenGroup_free( self->group );

    free( self->content );

    self->t       = NULL;
    self->content = NULL;

    Runtime_Free( self );

    return NULL;
}

Token.getContent

#   Returns a read-only reference to the token's text.
public getContent() : const string&
{
    return @content;
}
//  Returns the token's text (borrowed; owned and freed by the Token).
const char* Token_getContent( Token* this )
{
    return this->content;
}
//  Returns the token's text.
Token.prototype.getContent
=
function()
{
    return this.content;
}

Token.getLength

#   Returns the length of the token's text.
public getLength() : integer
{
    return @length;
}
//  Returns the length of the token's text.
int Token_getLength( Token* this )
{
    return this->length;
}
//  Returns the length of the token's text.
Token.prototype.getLength
=
function()
{
    return this.length;
}

Token.getTokenGroup

#   Returns the token's character group.
#   NOTE(review): '()' missing after the method name (cf. getLength()) — confirm.
public getTokenGroup : TokenGroup
{
    return @group;
}
//  Returns the token's character group (borrowed; owned by the Token).
TokenGroup* Token_getTokenGroup( Token* this )
{
    return this->group;
}
//  Returns the token's character group.
Token.prototype.getTokenGroup
=
function()
{
    return this.group;
}

Token.getTokenType

#   Returns the token's refined type.
#   NOTE(review): '()' missing after the method name (cf. getLength()) — confirm.
public getTokenType : TokenType
{
    return @type;
}
//  Returns the token's refined type.
EnumTokenType Token_getTokenType( Token* this )
{
    return this->type;
}
//  Returns the token's refined type.
Token.prototype.getTokenType
=
function()
{
    return this.type;
}

Token.DetermineTokenType

#   Maps a token's character group (plus, for some groups, its text) to
#   a refined TokenType.
#   Bug fix: corrected the misspelled call 'DetermineAlhanumericType'
#   to 'DetermineAlphanumericType' (cf. the C helper of that name).
private DetermineTokenType( group: TokenGroup, content: string& )
{
    var type = TokenType.UNKNOWN;

    switch( group )
    {
    case WHITESPACE:
        type = Token.DetermineWhitespaceType( content );
        break;

    case SYMBOLIC:
        type = Token.DetermineSymbolicType( content );
        break;

    case ALPHANUMERIC:
        type = Token.DetermineAlphanumericType( content );
        break;

    case VALUE:
        type = TokenType.VALUE;
        break;

    case HEX_VALUE:
        type = TokenType.HEX;
        break;

    case OPEN:
        type = TokenType.UNKNOWN_TYPE;
        break;

    case CLOSE:
        type = TokenType.UNKNOWN_TYPE;
        break;

    case UNKNOWN_GROUP:
        type = TokenType.UNKNOWN_TYPE;
        break;
    }

    return type;
}
/*
 *  Maps a token's character group (plus, for some groups, its text) to
 *  a refined EnumTokenType.
 *
 *  NOTE(review): CHAR and VALUE both map to FLOAT here — for CHAR that
 *  looks like a copy/paste slip (the Ix version maps VALUE to a VALUE
 *  type and does not special-case CHAR); confirm the intended mapping.
 */
EnumTokenType Token_DetermineTokenType( TokenGroup* group, const char* content )
{
    EnumTokenType type = UNKNOWN_TYPE;

    switch ( group->groupType )
    {
    case UNKNOWN_GROUP:
        type = UNKNOWN_TYPE;
        break;

    case WHITESPACE:
        type = Token_DetermineWhitespaceType( content );
        break;

    case OPEN:
        type = Token_DetermineOpenType( content );
        break;

    case CLOSE:
        type = Token_DetermineCloseType( content );
        break;

    case SYMBOLIC:
        type = Token_DetermineSymbolicType( content );
        break;

    case ALPHANUMERIC:
        type = Token_DetermineAlphanumericType( content );
        break;

    case STRING:
        type = UNKNOWN_TYPE;
        break;

    case CHAR:
        type = FLOAT;
        break;

    case VALUE:
        type = FLOAT;
        break;

    case HEX_VALUE:
        type = HEX;
        break;

    default:
        type = UNKNOWN_TYPE;
    }

    return type;
}
/*
 *  Maps a token's character group (plus, for some groups, its text) to
 *  a refined TokenType.
 *
 *  Bug fixes: the computed 'type' was never returned (the Ix and C
 *  versions both return it), and the misspelled call
 *  'DetermineAlhanumericType' is corrected to 'DetermineAlphanumericType'.
 */
Token.DetermineTokenType
=
function( group, content )
{
    var type = TokenType.UNKNOWN;

    switch( group )
    {
    case TokenGroup.WHITESPACE:
        type = Token.DetermineWhitespaceType( content );
        break;

    case TokenGroup.SYMBOLIC:
        type = Token.DetermineSymbolicType( content );
        break;

    case TokenGroup.ALPHANUMERIC:
        type = Token.DetermineAlphanumericType( content );
        break;

    case TokenGroup.VALUE:
        type = TokenType.VALUE;
        break;

    case TokenGroup.HEX_VALUE:
        type = TokenType.HEX;
        break;
    }

    return type;
}

Token.DetermineWhitespaceType

/*
 *  Classifies a whitespace token by its first character.
 */
EnumTokenType Token_DetermineWhitespaceType( const char* content )
{
    char first = content[0];

    if      ( ' '  == first ) return SPACE;
    else if ( '\t' == first ) return TAB;
    else if ( '\n' == first ) return NEWLINE;
    else                      return UNKNOWN_WHITESPACE;
}

Token.DetermineOpenType

/*
 *  Classifies an opening bracket token by its first character.
 */
EnumTokenType Token_DetermineOpenType( const char* content )
{
    char first = content[0];

    if      ( '{' == first ) return STARTBLOCK;
    else if ( '(' == first ) return STARTEXPRESSION;
    else if ( '[' == first ) return STARTSUBSCRIPT;
    else if ( '<' == first ) return STARTTAG;
    else                     return UNKNOWN_OPEN;
}

Token.DetermineCloseType

EnumTokenType Token_DetermineCloseType( const char* content )
{
    switch ( content[0] )
    {
    case '}':
        return ENDBLOCK;
    case ')':
        return ENDEXPRESSION;
    case ']':
        return ENDSUBSCRIPT;
    case '>':
        return ENDTAG;
    default:
        return UNKNOWN_OPEN;
    }
}

Token.DetermineSymbolicType

/*
 *  Classifies a symbolic token by its first one or two characters.
 *  Reading content[1] is safe: it is only reached when content[0]
 *  matched a case (so is non-NUL), making content[1] at worst the NUL
 *  terminator, which falls through to each inner default.
 *
 *  NOTE(review): '|' has no case and so falls through to SYMBOL —
 *  confirm whether it should mirror the '&' handling.
 */
EnumTokenType Token_DetermineSymbolicType( const char* content )
{
    switch ( content[0] )
    {
    case '~':   return SYMBOL;
    case '!':
        switch ( content[1] )
        {
        case '=':  return INFIXOP;
        default:   return PREFIXOP;
        }
        break;

    case '@':   return SYMBOL;
    case '#':   return SYMBOL;
    case '$':   return SYMBOL;
    case '%':
        switch ( content[1] )
        {
        case '=':  return ASSIGNMENTOP;
        default:   return INFIXOP;
        }
        break;

    case '^':
        switch ( content[1] )
        {
        case '=':  return ASSIGNMENTOP;
        default:   return INFIXOP;
        }
        break;

    case '&':
        switch ( content[1] )
        {
        case '&':  return INFIXOP;
        case '=':  return ASSIGNMENTOP;
        default:   return INFIXOP;
        }
        break;

    case '*':
        switch ( content[1] )
        {
        case '=':  return ASSIGNMENTOP;
        default:   return INFIXOP;
        }
        break;

    case '-':
        switch ( content[1] )
        {
        case '-':  return PREPOSTFIXOP;
        case '=':  return ASSIGNMENTOP;
        default:   return INFIXOP;
        }
        break;

    case '+':
        switch ( content[1] )
        {
        case '+':  return PREPOSTFIXOP;
        case '=':  return ASSIGNMENTOP;
        default:   return INFIXOP;
        }
        break;

    case '=':
        switch ( content[1] )
        {
        case '=':  return INFIXOP;
        default:   return ASSIGNMENTOP;
        }
        break;

    case '/':
        //  Comment starts are detected here: // and /*.
        switch ( content[1] )
        {
        case '/':  return LINECOMMENT;
        case '*':  return COMMENT;
        case '=':  return ASSIGNMENTOP;
        default:   return INFIXOP;
        }
        break;

    case ':':   return OPERATOR;
    case ';':   return STOP;
    case '<':   return INFIXOP;
    case '>':   return INFIXOP;
    default:    return SYMBOL;
    }
}

Token.DetermineAlphanumericType

/*
 *  Classifies an alphanumeric token by exact spelling: structural words
 *  (class/import/...), access modifiers, primitive type names, known
 *  keywords; anything else is a plain WORD.
 *
 *  Fix: common keywords (do, else, if, new, this, while) and the
 *  'static' modifier were missing and rendered as plain WORDs.
 */
EnumTokenType Token_DetermineAlphanumericType( const char* content )
{
         if ( StringEquals( content, "class"      ) ) return CLASS;
    else if ( StringEquals( content, "import"     ) ) return IMPORT;
    else if ( StringEquals( content, "include"    ) ) return INCLUDE;
    else if ( StringEquals( content, "interface"  ) ) return INTERFACE;
    else if ( StringEquals( content, "package"    ) ) return PACKAGE;

    else if ( StringEquals( content, "public"     ) ) return MODIFIER;
    else if ( StringEquals( content, "protected"  ) ) return MODIFIER;
    else if ( StringEquals( content, "private"    ) ) return MODIFIER;
    else if ( StringEquals( content, "static"     ) ) return MODIFIER;

    else if ( StringEquals( content, "bool"       ) ) return PRIMITIVE;
    else if ( StringEquals( content, "boolean"    ) ) return PRIMITIVE;
    else if ( StringEquals( content, "byte"       ) ) return PRIMITIVE;
    else if ( StringEquals( content, "char"       ) ) return PRIMITIVE;
    else if ( StringEquals( content, "const"      ) ) return PRIMITIVE;
    else if ( StringEquals( content, "double"     ) ) return PRIMITIVE;
    else if ( StringEquals( content, "float"      ) ) return PRIMITIVE;
    else if ( StringEquals( content, "int"        ) ) return PRIMITIVE;
    else if ( StringEquals( content, "integer"    ) ) return PRIMITIVE;
    else if ( StringEquals( content, "long"       ) ) return PRIMITIVE;
    else if ( StringEquals( content, "short"      ) ) return PRIMITIVE;
    else if ( StringEquals( content, "signed"     ) ) return PRIMITIVE;
    else if ( StringEquals( content, "string"     ) ) return PRIMITIVE;
    else if ( StringEquals( content, "unsigned"   ) ) return PRIMITIVE;
    else if ( StringEquals( content, "void"       ) ) return PRIMITIVE;

    else if ( StringEquals( content, "break"      ) ) return KEYWORD;
    else if ( StringEquals( content, "case"       ) ) return KEYWORD;
    else if ( StringEquals( content, "catch"      ) ) return KEYWORD;
    else if ( StringEquals( content, "default"    ) ) return KEYWORD;
    else if ( StringEquals( content, "do"         ) ) return KEYWORD;
    else if ( StringEquals( content, "else"       ) ) return KEYWORD;
    else if ( StringEquals( content, "extends"    ) ) return KEYWORD;
    else if ( StringEquals( content, "if"         ) ) return KEYWORD;
    else if ( StringEquals( content, "implements" ) ) return KEYWORD;
    else if ( StringEquals( content, "for"        ) ) return KEYWORD;
    else if ( StringEquals( content, "foreach"    ) ) return KEYWORD;
    else if ( StringEquals( content, "let"        ) ) return KEYWORD;
    else if ( StringEquals( content, "namespace"  ) ) return KEYWORD;
    else if ( StringEquals( content, "new"        ) ) return KEYWORD;
    else if ( StringEquals( content, "return"     ) ) return KEYWORD;
    else if ( StringEquals( content, "switch"     ) ) return KEYWORD;
    else if ( StringEquals( content, "this"       ) ) return KEYWORD;
    else if ( StringEquals( content, "try"        ) ) return KEYWORD;
    else if ( StringEquals( content, "var"        ) ) return KEYWORD;
    else if ( StringEquals( content, "while"      ) ) return KEYWORD;
    else                                              return WORD;
}

Token.print

/*
 *  Writes the token's content to 'stream', preceded by a terminal colour
 *  escape chosen from the token's group (and, for bracket/symbolic and
 *  alphanumeric tokens, its more specific type), then resets the colour.
 */
void Token_print( Token* self, void* stream )
{
    switch ( self->group->groupType )
    {
    case OPEN:
    case CLOSE:
    case SYMBOLIC:
        //  Comments are symbolic tokens (// and /*), so they are
        //  distinguished here by token type.
        switch( self->type )
        {
        case COMMENT:
        case LINECOMMENT:
            Term_Colour( stream, COLOR_COMMENT );
            break;

        default:
            Term_Colour( stream, COLOR_BOLD );
        }
        break;

    case STRING:
        Term_Colour( stream, COLOR_STRING );
        break;

    case CHAR:
        Term_Colour( stream, COLOR_CHAR );
        break;

    case ALPHANUMERIC:
        switch ( self->type )
        {
        case PRIMITIVE:
            Term_Colour( stream, COLOR_TYPE );
            break;

        case CLASS:
        case KEYWORD:
        case MODIFIER:
            Term_Colour( stream, COLOR_MODIFIER );
            break;

        case WORD:
            Term_Colour( stream, COLOR_NORMAL );
            break;

        default:
            Term_Colour( stream, COLOR_LIGHT );
        }
        break;

    case VALUE:
        Term_Colour( stream, COLOR_VALUE );
        break;

    case UNKNOWN_GROUP:
        Term_Colour( stream, COLOR_UNKNOWN );
        break;

    default:
        Term_Colour( stream, COLOR_NORMAL );
    }
    fprintf( stream, "%s", self->content );
    Term_Colour( stream, COLOR_NORMAL );   //  Always restore the default colour.
}

Token Group (Enum)

Enum definitions

#   Character classes used to decide which characters may be appended
#   together to form a single token.
public enum EnumTokenGroup
{
    UNKNOWN      : "UNKNOWN"
    WHITESPACE   : "WHITESPACE"     #   
    OPEN         : "OPEN"           # { ( [ <
    CLOSE        : "CLOSE"          # } ) ] >
    SYMBOLIC     : "SYMBOLIC"       # ~!@#$%^&*-
    ESCAPE       : "ESCAPE"         # \\
    ALPHANUMERIC : "ALPHANUMERIC"   # _ A-Z a-z 0-9
    STRING       : "STRING"         # \" 
    CHAR         : "CHAR"           # \' 
    VALUE        : "VALUE"          # 9, 
    HEX_VALUE    : "HEX_VALUE"      # 0x9999
}
#ifndef LIBTOKENIZER_ENUMTOKENGROUP_H
#define LIBTOKENIZER_ENUMTOKENGROUP_H

#include "libtokenizer/Base.h"

//  Character classes used to decide which characters may be appended
//  together to form a single token (per-group examples follow the
//  Ix enum definition).
typedef enum _EnumTokenGroup
{
    UNKNOWN_GROUP,
    WHITESPACE,     //  space, tab, newline
    OPEN,           //  { ( [ <
    CLOSE,          //  } ) ] >
    SYMBOLIC,       //  ~ ! @ # $ % ^ & * -
    ESCAPE,         //  backslash
    ALPHANUMERIC,   //  _ A-Z a-z 0-9
    STRING,         //  double quote
    CHAR,           //  single quote
    VALUE,          //  decimal digits
    HEX_VALUE       //  e.g. 0x9999

} EnumTokenGroup;

#endif
//  Character classes used to decide which characters may be appended
//  together to form a single token (mirrors the C EnumTokenGroup,
//  whose first constant is named UNKNOWN_GROUP).
export var EnumTokenGroup = new Enum([
    "UNKNOWN",
    "WHITESPACE",
    "OPEN",
    "CLOSE",
    "SYMBOLIC",
    "ESCAPE",
    "ALPHANUMERIC",
    "STRING",
    "CHAR",
    "VALUE",
    "HEX_VALUE"
]);
//  Character classes used to decide which characters may be appended
//  together to form a single token.
//  NOTE(review): unlike the C/JS versions, this enum omits ESCAPE,
//  STRING and CHAR — confirm whether the versions are meant to stay
//  in step.
public enum EnumTokenGroup
{
    UNKNOWN,
    WHITESPACE,
    OPEN,
    CLOSE,
    SYMBOLIC,
    ALPHANUMERIC,
    VALUE,
    HEX_VALUE
}

Token type

#   Refined token classifications; compare the C EnumTokenType, which
#   additionally declares UNKNOWN_WHITESPACE/OPEN/CLOSE variants.
public enum EnumTokenType
{
    UNKNOWN_TYPE
    WORD            # Alphanumeric
    FILEPATH
    PACKAGE
    IMPORT
    INCLUDE
    CLASS
    CLASSNAME
    INTERFACE
    ENUM
    ENUMNAME
    GENERIC
    ANNOTATION
    IMETHOD
    METHOD
    BLOCK
    STARTBLOCK      # Open
    ENDBLOCK
    MEMBER
    EXPRESSION
    STARTEXPRESSION
    ENDEXPRESSION
    CLAUSE
    PARAMETERS
    PARAMETER
    ARGUMENTS
    ARGUMENT
    STATEMENT
    DECLARATION
    COMMENT
    JAVADOC
    BLANKLINE
    TOKEN
    SYMBOL
    KEYWORD
    MODIFIED        # NOTE(review): the C enum has MODIFIER — confirm which is intended.
    PRIMITIVE
    TYPE
    METHODNAME
    VARIABLE
    NAME
    METHODCALL
    CONSTRUCTOR
    OPERATOR
    ASSIGNMENTOP
    PREFIXOP
    INFIXOP
    POSTFIXOP
    PREINFIXOP
    PREPOSTFIXOP
    SELECTOR
    VALUE
    FLOAT
    INTEGER
    NUMBER
    HEX
    OCTAL
    DOUBLEQUOTE
    QUOTE
    STOP
    TAB
    SPACE
    WHITESPACE
    NEWLINE
    LINECOMMENT
    ESCAPED
    OTHER
}
//  Refined token type names; kept loosely in step with the C
//  EnumTokenType (see NOTE comments where they diverge).
export var EnumTokenType = new Enum
([
    "UNKNOWN_TYPE",
    "WORD",
    "FILE",             //  NOTE(review): Ix/C versions use FILEPATH — confirm.
    "PACKAGE",
    "IMPORT",
    "INCLUDE",
    "CLASS",
    "CLASSNAME",
    "INTERFACE",
    "ENUM",
    "ENUMNAME",
    "srcERIC",          //  NOTE(review): looks like a garbled "GENERIC" (cf. Ix/C) — confirm.
    "ANNOTATION",
    "IMETHOD",
    "METHOD",
    "BLOCK",
    "STARTBLOCK",
    "ENDBLOCK",
    "MEMBER",
    "EXPRESSION",
    "STARTEXPRESSION",
    "ENDEXPRESSION",
    "CLAUSE",
    "PARAMETERS",
    "PARAMETER",
    "ARGUMENTS",
    "ARGUMENT",
    "STATEMENT",
    "DECLARATION",
    "COMMENT",
    "JAVADOC",
    "BLANKLINE",
    "TOKEN",
    "SYMBOL",
    "KEYWORD",
    "MODIFIED",         //  NOTE(review): the C enum has MODIFIER — confirm.
    "PRIMITIVE",
    "TYPE",
    "METHODNAME",
    "VARIABLE",
    "NAME",
    "METHODCALL",
    "CONSTRUCTOR",
    "OPERATOR",
    "ASSIGNMENTOP",
    "PREFIXOP",
    "INFIXOP",
    "POSTFIXOP",
    "PREINFIXOP",
    "PREPOSTFIXOP",
    "SELECTOR",
    "VALUE",
    "FLOAT",
    "INTEGER",
    "NUMBER",
    "HEX",
    "OCTAL",
    "DOUBLEQUOTE",
    "QUOTE",
    "STOP",
    "TAB",
    "SPACE",
    "WHITESPACE",
    "NEWLINE",
    "LINECOMMENT",
    "ESCAPED",
    "OTHER"
]);
//  Refined token classifications assigned by Token_DetermineTokenType
//  and its helpers.  (Indentation normalized: the original mixed tabs
//  and spaces; the constants and their order are unchanged.)
typedef enum _EnumTokenType
{
    UNKNOWN_TYPE,
    UNKNOWN_WHITESPACE,
    UNKNOWN_OPEN,
    UNKNOWN_CLOSE,

    //  Whitespace
    SPACE,
    TAB,
    NEWLINE,

    //  Open
    STARTBLOCK,
    STARTEXPRESSION,
    STARTSUBSCRIPT,
    STARTTAG,

    //  Close
    ENDBLOCK,
    ENDEXPRESSION,
    ENDSUBSCRIPT,
    ENDTAG,

    //  Symbolic
    OPERATOR,
    ASSIGNMENTOP,
    PREFIXOP,
    INFIXOP,
    POSTFIXOP,
    PREINFIXOP,
    PREPOSTFIXOP,
    STOP,
    LINECOMMENT,
    COMMENT,

    //  Words
    //  (none yet)

    //  Composite
    WORD,
    FILEPATH,
    PACKAGE,
    IMPORT,
    INCLUDE,
    CLASS,
    CLASSNAME,
    INTERFACE,
    ENUM,
    ENUMNAME,
    GENERIC,
    ANNOTATION,
    IMETHOD,
    METHOD,
    BLOCK,
    MEMBER,
    MEMBERNAME,
    EXPRESSION,
    CLAUSE,
    PARAMETERS,
    PARAMETER,
    ARGUMENTS,
    ARGUMENT,
    STATEMENT,
    DECLARATION,
    JAVADOC,
    BLANKLINE,
    TOKEN,
    SYMBOL,
    KEYWORD,
    MODIFIER,
    PRIMITIVE,
    TYPE,
    METHODNAME,
    VARIABLE,
    NAME,
    METHODCALL,
    CONSTRUCTOR,
    SELECTOR,
    FLOAT,
    INTEGER,
    NUMBER,
    HEX,
    OCTAL,
    DOUBLEQUOTE,
    QUOTE,
    ESCAPED,
    OTHER
} EnumTokenType;
/**
 * Fine-grained token classification (Java version).
 *
 * BUGFIX: restored the constants GENERIC and MODIFIER, which had been
 * corrupted to "srcERIC" and "MODIFIED" — the C EnumTokenType in this
 * library uses GENERIC and MODIFIER for the same concepts.
 */
public enum EnumTokenType
{
    UNKNOWN_TYPE,
    WORD,
    FILE,
    PACKAGE,
    IMPORT,
    INCLUDE,
    CLASS,
    CLASSNAME,
    INTERFACE,
    ENUM,
    ENUMNAME,
    GENERIC,
    ANNOTATION,
    IMETHOD,
    METHOD,
    BLOCK,
    STARTBLOCK,
    ENDBLOCK,
    MEMBER,
    EXPRESSION,
    STARTEXPRESSION,
    ENDEXPRESSION,
    CLAUSE,
    PARAMETERS,
    PARAMETER,
    ARGUMENTS,
    ARGUMENT,
    STATEMENT,
    DECLARATION,
    COMMENT,
    JAVADOC,
    BLANKLINE,
    TOKEN,
    SYMBOL,
    KEYWORD,
    MODIFIER,
    PRIMITIVE,
    TYPE,
    METHODNAME,
    VARIABLE,
    NAME,
    METHODCALL,
    CONSTRUCTOR,
    OPERATOR,
    ASSIGNMENTOP,
    PREFIXOP,
    INFIXOP,
    POSTFIXOP,
    PREINFIXOP,
    PREPOSTFIXOP,
    SELECTOR,
    VALUE,
    FLOAT,
    INTEGER,
    NUMBER,
    HEX,
    OCTAL,
    DOUBLEQUOTE,
    QUOTE,
    STOP,
    TAB,
    SPACE,
    WHITESPACE,
    NEWLINE,
    LINECOMMENT,
    ESCAPED,
    OTHER
}

Token Group

Class Definitions

public class TokenGroup
{
    @character: char
    @groupType: EnumTokenGroup
}
#ifndef LIBTOKENIZER_TOKENGROUP_H
#define LIBTOKENIZER_TOKENGROUP_H

#include "libtokenizer/Base.h"
#include "libtokenizer/EnumTokenGroup.h"

//  A TokenGroup records the lexical group (symbolic, string, alphanumeric,
//  whitespace, ...) that a single character belongs to.
typedef struct _TokenGroup
{
    char           character;
    EnumTokenGroup groupType;

} TokenGroup;

TokenGroup*    TokenGroup_new          ( char ch );                    //  Classify ch into a new heap descriptor.
TokenGroup*    TokenGroup_free         ( TokenGroup* self );           //  Release self; always returns null.
EnumTokenGroup TokenGroup_DetermineType( char ch );                    //  Map a character to its group constant.
bool           TokenGroup_matches      ( TokenGroup* self, char ch );  //  True if ch belongs with this group.
TokenGroup*    TokenGroup_copy         ( const TokenGroup* self );     //  Deep copy of self.

#endif

Constructors

public new( character : char )
{
    @character = character
    @groupType = TokenGroup.DetermineType( character );
}
#include "libtokenizer/Base.h"
#include "libtokenizer/Runtime.h"
#include "libtokenizer/TokenGroup.h"

TokenGroup* TokenGroup_new( char character )
{
    //  Allocate a descriptor for the given character and classify it.
    //  Returns null if the allocation fails.
    TokenGroup* self = Runtime_Calloc( 1, sizeof( TokenGroup ) );

    if ( NULL == self ) return NULL;

    self->character = character;
    self->groupType = TokenGroup_DetermineType( character );

    return self;
}
//  Remember the character and classify it into a token group.
function TokenGroup( character )
{
    this.character = character;
    this.groupType = TokenGroup.DetermineType( character );
}

Destructors

TokenGroup* TokenGroup_free( TokenGroup* self )
{
    //  Scrub the descriptor's fields, release it, and return null so the
    //  caller can clear its pointer in one assignment.
    if ( NULL != self )
    {
        self->character = 0;
        self->groupType = 0;
    }

    return Runtime_Free( self );
}

TokenGroup.DetermineType

EnumTokenGroup TokenGroup_DetermineType( char ch )
{
    //  Decide which lexical group a single character belongs to.

    if ( ('0' <= ch) && (ch <= '9') ) return VALUE;
    if ( ('A' <= ch) && (ch <= 'Z') ) return ALPHANUMERIC;
    if ( ('a' <= ch) && (ch <= 'z') ) return ALPHANUMERIC;
    if (  '_'  == ch                ) return ALPHANUMERIC;
    if (  '"'  == ch                ) return STRING;
    if (  '\'' == ch                ) return CHAR;
    if (  '\\' == ch                ) return ESCAPE;

    switch ( ch )
    {
    case '(': case '{': case '[': case '<':
        return OPEN;

    case ')': case '}': case ']': case '>':
        return CLOSE;

    case '~': case '!': case '@': case '#': case '$':
    case '%': case '^': case '&': case '*': case '-':
    case '+': case '=': case '|': case ':': case ';':
    case ',': case '.': case '?': case '/':
        return SYMBOLIC;

    default:
        //  TAB(9), LF, VT, FF, CR, SO, SI and SPACE(32) count as whitespace.
        if ( ((9 <= ch) && (ch <= 15)) || (32 == ch) ) return WHITESPACE;

        return UNKNOWN_GROUP;
    }
}
TokenGroup.DetermineType
=
function( ch )
{
    //  Map a single character to its EnumTokenGroup constant.
    //
    //  BUGFIX: brought into line with the C implementation, which also
    //  treats ',', '?' and '/' as SYMBOLIC and TAB (code 9) as WHITESPACE.
    //  (Backslash escapes are intentionally not handled here, because
    //  PushbackReader.read consumes escape sequences whole.)
    switch ( ch )
    {
    case '~':
    case '!':
    case '@':
    case '#':
    case '$':
    case '%':
    case '^':
    case '&':
    case '*':
    case '-':
    case '+':
    case '=':
    case '|':
    case ':':
    case ';':
    case ',':
    case '.':
    case '?':
    case '/':
        return EnumTokenGroup.SYMBOLIC;

    case '(':
    case '{':
    case '[':
    case '<':
        return EnumTokenGroup.OPEN;

    case ')':
    case '}':
    case ']':
    case '>':
        return EnumTokenGroup.CLOSE;

    case '"':
        return EnumTokenGroup.STRING;

    case '\'':
        return EnumTokenGroup.CHAR;

    case '_':
        return EnumTokenGroup.ALPHANUMERIC;

    default:
        var char_code = ch.charCodeAt( 0 );

        switch ( char_code )
        {
        case  9: // TAB
        case 10: // LF
        case 11: // VT
        case 12: // FF
        case 13: // CR
        case 14: // SO
        case 15: // SI
        case 32: // SPACE
            return EnumTokenGroup.WHITESPACE;

        default:
            if ( (48 <= char_code) && (char_code <= 57) ) // digits
            {
                return EnumTokenGroup.VALUE;
            }
            else
            if ( (65 <= char_code) && (char_code <= 90) ) // uppercase
            {
                return EnumTokenGroup.ALPHANUMERIC;
            }
            else
            if ( (97 <= char_code) && (char_code <= 122) ) // lowercase
            {
                return EnumTokenGroup.ALPHANUMERIC;
            }
            return EnumTokenGroup.UNKNOWN;
        }
    }
}

TokenGroup.matches

bool TokenGroup_matches( TokenGroup* self, char ch )
{
    //  Decide whether ch may continue a token belonging to this group.
    EnumTokenGroup secondType;

    if ( '\0' == ch ) return FALSE;

    secondType = TokenGroup_DetermineType( ch );

    switch ( self->groupType )
    {
    case SYMBOLIC:
        //  Symbols cluster with other symbols.
        return (SYMBOLIC == secondType) ? TRUE : FALSE;

    case STRING:
        //  Everything continues a string except the closing double quote.
        return (STRING == secondType) ? FALSE : TRUE;

    case CHAR:
        //  Everything continues a char literal except the closing quote.
        return (CHAR == secondType) ? FALSE : TRUE;

    case ALPHANUMERIC:
        //  Identifiers may contain both letters and digits.
        return ((ALPHANUMERIC == secondType) || (VALUE == secondType)) ? TRUE : FALSE;

    case WHITESPACE:
        return (WHITESPACE == secondType) ? TRUE : FALSE;

    case VALUE:
        if ( VALUE == secondType ) return TRUE;

        if ( ALPHANUMERIC == secondType )
        {
            //  Hex digits A-F/a-f and the 'x' of an 0x prefix extend a number.
            if ( ('A' <= ch) && (ch <= 'F') ) return TRUE;
            if ( ('a' <= ch) && (ch <= 'f') ) return TRUE;

            return ('x' == ch) ? TRUE : FALSE;
        }
        return FALSE;

    case UNKNOWN_GROUP:
        return (UNKNOWN_GROUP == secondType) ? TRUE : FALSE;

    default:
        return FALSE;
    }
}
TokenGroup.prototype.matches
=
function( character )
{
    //  Decide whether 'character' may continue a token of this group.
    //
    //  BUGFIX: the hex-digit bounds below compared char codes against the
    //  STRINGS '65' and '97'; JavaScript's coercion made the result right
    //  by accident, but they are now proper numeric literals
    //  (65 = 'A', 70 = 'F', 97 = 'a', 102 = 'f').
    if ( "" == character )
    {
        return false;
    }
    else
    {
        var secondType = TokenGroup.DetermineType( character );
        var char_code  = character.charCodeAt( 0 );

        switch ( this.groupType )
        {
        case EnumTokenGroup.SYMBOLIC:
            switch ( secondType )
            {
            case EnumTokenGroup.SYMBOLIC:
                return true;

            default:
                return false;
            }
            break;

        case EnumTokenGroup.STRING:
            //  A string continues until the closing double quote.
            switch ( secondType )
            {
            case EnumTokenGroup.STRING:
                return false;

            default:
                return true;
            }
            break;

        case EnumTokenGroup.CHAR:
            //  Char literals are consumed whole by PushbackReader.read,
            //  so a CHAR group never continues here.
            return false;
            break;

        case EnumTokenGroup.ALPHANUMERIC:
            switch ( secondType )
            {
            case EnumTokenGroup.ALPHANUMERIC:
            case EnumTokenGroup.VALUE:
                return true;

            default:
                return false;
            }
            break;

        case EnumTokenGroup.WHITESPACE:
            switch ( secondType )
            {
            case EnumTokenGroup.WHITESPACE:
                return true;

            default:
                return false;
            }
            break;

        case EnumTokenGroup.VALUE:
            switch ( secondType )
            {
            case EnumTokenGroup.VALUE:
                return true;

            case EnumTokenGroup.ALPHANUMERIC:
                //  Allow hex digits A-F/a-f and the 'x' of an 0x prefix.
                if ( (65 <= char_code) && (char_code <= 70) )
                {
                    return true;
                }
                else
                if ( (97 <= char_code) && (char_code <= 102) )
                {
                    return true;
                }
                else
                return ("x" == character);

            default:
                return false;
            }
            break;

        case EnumTokenGroup.UNKNOWN:
            switch ( secondType )
            {
            case EnumTokenGroup.UNKNOWN:
                return true;

            default:
                return false;
            }
            break;

        default:
            return false;
        }
    }
}

TokenGroup.copy

TokenGroup* TokenGroup_copy( const TokenGroup* self )
{
    //  Deep copy of self; returns null if the allocation fails.
    //  BUGFIX: the allocation is now checked before being written through,
    //  matching the null-guard used by TokenGroup_new.
    TokenGroup* copy = Runtime_Calloc( 1, sizeof( TokenGroup ) );

    if ( copy )
    {
        copy->character = self->character;
        copy->groupType = self->groupType;
    }

    return copy;
}

Array

Example usage: Array

#include <stdlib.h>
#include <stdio.h>
#include "libtokenizer/Array.h"
#include "libtokenizer/Runtime.h"
#include "libtokenizer/String.h"

int main( int argc, char** argv )
{
    //  Push 100 heap-allocated strings onto one array, move them all to a
    //  second array via shift/unshift, then drain and free them — checking
    //  that both arrays empty out and no tracked allocation leaks.
    Array* array  = Array_new( NULL );
    Array* target = Array_new( NULL );

    int len;

    if ( 1 )
    {
        for ( int i=0; i < 100; i++ )
        {
            char* test = StringCopy( "test" );

            Array_push( array, test );
        }

        len = Array_length( array );

        //fprintf( stdout, "Removing %i items\n", len );
        for ( int i=0; i < len; i++ )
        {
            char* test = (char*) Array_shift( array );

            //fprintf( stdout, "%2i: %s\n", i, test );

            Array_unshift( target, test );
        }
        len = Array_length( array );

        //fprintf( stdout, "%i items left\n", len );

        //  The source array must now be empty.
        if ( 0 != len )
        {
            fprintf( stderr, "Unusual circumstance\n" );
            exit( -1 );
        }

        //  Shifting an empty array must return null.
        if ( (char*) Array_shift( array ) )
        {
            fprintf( stderr, "Unusual circumstance\n" );
            exit( -1 );
        }
    }
    Array_free( &array );

    if ( 1 )
    {
        len = Array_length( target );
        //fprintf( stdout, "Removing %i items from target\n", len );
        for ( int i=0; i < len; i++ )
        {
            char* test = (char*) Array_shift( target );

            //fprintf( stdout, "%2i: %s\n", i, test );

            free( test );
        }
        len = Array_length( target );

        //fprintf( stdout, "%i items left\n", len );

        if ( 0 != len )
        {
            fprintf( stderr, "Unusual circumstance\n" );
            exit( -1 );
        }

        if ( (char*) Array_shift( target ) )
        {
            fprintf( stderr, "Unusual circumstance\n" );
            exit( -1 );
        }
    }
    Array_free( &target );

    //  Non-zero means some tracked allocation was never freed.
    if ( Runtime_Allocated() )
    {
        fprintf( stderr, "Memory leak: %i\n", Runtime_Allocated() );
    }
}

Class Definition

#ifndef LIBTOKENIZER_ARRAY_H
#define LIBTOKENIZER_ARRAY_H

//  A growable array of untyped object pointers; the backing store doubles
//  when full.  Used as the storage for Queue.
typedef struct _Array
{
    void** objects;  //  Backing store with 'size' slots.
    int    length;   //  Number of occupied slots.
    int    size;     //  Allocated capacity in slots.

} Array;

Array* Array_new    ( Array*  self );                //  Allocate, or reinitialise a caller-supplied struct.
Array* Array_free   ( Array** self );                //  Release and null the caller's pointer.
Array* Array_push   ( Array*  self, void* object );  //  Append at the tail.
void*  Array_shift  ( Array*  self );                //  Remove and return the head, or null.
Array* Array_unshift( Array*  self, void* object );  //  Insert at the head.
int    Array_length ( Array*  self );                //  Number of stored elements.

#endif

Supporting private functions

#include <stdlib.h>
#include "libtokenizer/Array.h"
#include "libtokenizer/Runtime.h"

void Array_expand( Array* self )
{
    //  Grow the backing store: first allocation is a single slot,
    //  afterwards the capacity doubles each time.
    if ( 0 == self->size )
    {
        self->objects = (void**) Runtime_Calloc( 1, sizeof( void* ) );
        self->size    = 1;
        return;
    }

    int    doubled = self->size * 2;
    void** grown   = (void**) Runtime_Calloc( doubled, sizeof( void* ) );

    //  Carry the existing elements over into the larger store.
    for ( int i=0; i < self->length; i++ )
    {
        grown[i] = self->objects[i];
    }

    Runtime_Free( self->objects );

    self->objects = grown;
    self->size    = doubled;
}

Constructors

Array* Array_new( Array* self )
{
    //  Allocate a new empty array, or reinitialise a caller-supplied one.
    //  The backing store is created lazily on first growth.
    if ( NULL == self )
    {
        self = Runtime_Calloc( 1, sizeof( Array ) );
    }

    if ( NULL != self )
    {
        self->objects = NULL;
        self->length  = 0;
        self->size    = 0;
    }
    return self;
}

Destructors

Array* Array_free( Array** _self )
{
    //  Free the backing store and the struct, then null the caller's pointer.
    //  Note: the stored objects themselves are NOT freed — they belong to
    //  the caller.
    Array* self = *_self;

    if ( NULL != self )
    {
        if ( NULL != self->objects )
        {
            Runtime_Free( self->objects );
        }

        self->objects = NULL;
        self->length  = 0;
        self->size    = 0;

        self = Runtime_Free( self );
    }

    *_self = NULL;

    return NULL;
}

Array.push

Array* Array_push( Array* self, void* object )
{
    //  Append object at the tail, growing the backing store when full.
    if ( self->length == self->size ) Array_expand( self );

    self->objects[self->length] = object;
    self->length++;

    return self;
}

Array.shift

void* Array_shift( Array* self )
{
    //  Remove and return the head element, or null when the array is empty.
    //  Remaining elements are moved down one slot (O(n)).
    if ( 0 == self->length ) return NULL;

    void* head = self->objects[0];

    for ( int i=1; i < self->length; i++ )
    {
        self->objects[i-1] = self->objects[i];
        self->objects[i]   = NULL;
    }
    self->length--;

    return head;
}

Array.unshift

Array* Array_unshift( Array* self, void* object )
{
    //  Insert object at the head, shifting existing elements one slot
    //  to the right (O(n)).
    if ( self->length == self->size ) Array_expand( self );

    for ( int i=self->length; i > 0; i-- )
    {
        self->objects[i]   = self->objects[i-1];
        self->objects[i-1] = NULL;
    }

    self->objects[0] = object;
    self->length++;

    return self;
}

Array.length

int Array_length( Array* self )
{
    //  Number of elements currently stored.
    //  BUGFIX: tolerate a null array (returns 0) instead of dereferencing
    //  it, matching the null-safety of PushbackReader_read.
    return self ? self->length : 0;
}

Base

#ifndef LIBTOKENIZER_BASE_H
#define LIBTOKENIZER_BASE_H

//  Minimal portability definitions so the library does not depend on
//  <stdbool.h> or <stddef.h>.

//  NOTE(review): 'bool' is a macro for int, not a typedef — predates C99.
#ifndef bool
#define bool int
#endif

#ifndef TRUE
#define TRUE 1
#endif

#ifndef FALSE
#define FALSE 0
#endif

#ifndef NULL
#define NULL 0
#endif

#ifndef REF
#define REF 0
#endif

#endif

String

Example Usage

#include <stdio.h>
#include "libtokenizer/String.h"

int main( int argc, char** argv )
{
    //  Exercise String construction and destruction.
    String* test = String_new( NULL, "Test" );

    String_free( &test );

    //  String_free nulls the caller's pointer, so this should not print.
    if ( 0 != test )
    {
        printf( "Can still see string.\n" );
    }
}

Class Definition

#ifndef LIBTOKENIZER_STRING_H
#define LIBTOKENIZER_STRING_H

#include "libtokenizer/Base.h"

//  A length-tracked, heap-owned character string.
typedef struct _String
{
    char* content;
    int   length;

} String;

String*     String_new    ( String*  self, const char* content );  //  Copy content into a (new) String.
String*     String_free   ( String** self );                       //  Release and null the caller's pointer.

const char* String_content( const String* self );                  //  Borrowed pointer to the characters.
int         String_length ( const String* self );                  //  Length in characters.
String*     String_copy   ( const String* self );                  //  Deep copy.
String*     String_cat    ( const String* self, const String* other );  //  New String: self then other.
bool        String_equals ( const String* self, const String* other );  //  Content equality.

//  Raw char* helpers used throughout the library.
int   StringLength( const char* s                  );
char* StringCopy  ( const char* s                  );
char* StringCat   ( const char* s1, const char* s2 );
bool  StringEquals( const char* s1, const char* s2 );

#endif
#include <stdlib.h>
#include <string.h>

#include "libtokenizer/Runtime.h"
#include "libtokenizer/String.h"

String* String_new( String* self, const char* content )
{
    //  Initialise a String holding a private copy of content; allocates
    //  the struct itself when the caller passes null.
    if ( NULL == self )
    {
        self = Runtime_Calloc( 1, sizeof( String ) );
    }

    if ( NULL != self )
    {
        self->content = StringCopy  ( content );
        self->length  = StringLength( content );
    }
    return self;
}

String* String_free( String** _self )
{
    //  Release the owned character buffer and the struct, then null the
    //  caller's pointer.  Always returns null.
    String* self = *_self;

    if ( NULL != self )
    {
        free( self->content );
        self->content = NULL;
        self->length  = 0;

        self = Runtime_Free( self );
    }

    *_self = NULL;

    return self;
}

//  Borrowed pointer to the string's characters; the String retains ownership.
const char* String_content( const String* self )
{
    return self->content;
}

//  Length in characters, as recorded at construction time.
int String_length( const String* self )
{
    return self->length;
}

//  Deep copy: a new String with its own copy of the content.
String* String_copy( const String* self )
{
    return String_new( NULL, self->content );
}

String* String_cat( const String* self, const String* other )
{
    //  Return a new String holding self's content followed by other's.
    char*   joined = StringCat( self->content, other->content );
    String* result = String_new( NULL, joined );

    free( joined );

    return result;
}

//  True when both strings hold identical character content.
bool String_equals( const String* self, const String* other )
{
    return StringEquals( self->content, other->content );
}

int StringLength( const char* s )
{
    //  Length of s in characters, excluding the terminating NUL.
    return (int) strlen( s );
}

char* StringCopy( const char* s )
{
    //  Duplicate s into freshly-allocated, zero-initialised memory;
    //  the caller owns (and frees) the result.
    //  BUGFIX: allocate length + 1 (for the NUL) instead of length + 2,
    //  and guard against a failed calloc before calling strcpy.
    size_t len  = strlen( s ) + 1;
    char*  copy = calloc( len, sizeof( char ) );

    return copy ? strcpy( copy, s ) : NULL;
}

char* StringCat( const char* s1, const char* s2 )
{
    //  Return a newly-allocated string holding s1 followed by s2;
    //  the caller owns (and frees) the result.
    //  IMPROVED: uses memcpy instead of hand-rolled copy loops, and
    //  guards against a failed calloc before writing.
    size_t len1 = strlen( s1 );
    size_t len2 = strlen( s2 );

    char* concatenated = calloc( len1 + len2 + 1, sizeof( char ) );

    if ( concatenated )
    {
        memcpy( concatenated,        s1, len1 );
        memcpy( concatenated + len1, s2, len2 );
        //  calloc zeroed the buffer, so the terminating NUL is in place.
    }

    return concatenated;
}

bool StringEquals( const char* s1, const char* s2 )
{
    //  True when both C strings hold identical character content.
    return !strcmp( s1, s2 );
}
public class
{




}
#ifndef LIBTOKENIZER_INPUTSTREAM_H
#define LIBTOKENIZER_INPUTSTREAM_H

//  A file-backed input stream.
//  NOTE(review): only InputStream_new is implemented in this document;
//  InputStream_free and InputStream_read are declared but not defined here.
typedef struct _InputStream
{
    const char* filepath;
    void* f;    //  NOTE(review): presumably a FILE*; no visible code opens it.

} InputStream;

InputStream* InputStream_new ( const char*  filepath );
InputStream* InputStream_free( InputStream* self     );
int          InputStream_read( InputStream* self     );

#endif
#include <stdio.h>

#include "Libtokenizer/InputStream.h"
#include "Libtokenizer/String.h"

InputStream* InputStream_new( const char* filepath )
{
    //  Allocate an input stream remembering a private copy of filepath.
    //  NOTE(review): uses bare calloc rather than Runtime_Calloc, so this
    //  allocation is invisible to the leak counter — confirm intentional.
    //  NOTE(review): calloc needs <stdlib.h>, which this file does not
    //  include — verify it compiles on the target toolchain.
    InputStream* self = calloc( 1, sizeof( InputStream ) );

    if ( self )
    {
        self->filepath = StringCopy( filepath );
        //  NOTE(review): self->f is left null; the file is never opened here.
    }
    return self;
}
//  Expose each element of array as a same-named property on this object,
//  so enum members can be referenced as constants (e.g. e.WORD === "WORD").
function Enum( array )
{
    for ( var i = 0; i < array.length; i++ )
    {
        this[array[i]] = array[i];
    }
}

File

Example Usage

#include <stdlib.h>
#include <stdio.h>

#include "libtokenizer/File.h"

int main( int argc, char** argv )
{
    //  Print the contents of the sample file, or an error if it is missing.
    const char* filepath = "./test/Sample.txt";

    if ( ! File_Exists( filepath ) )
    {
        fprintf( stderr, "Could not find file: %s\n", filepath );
        fflush( stderr );
    }
    else
    {
        char* content = File_Get_Contents( filepath );

        fprintf( stdout, "%s\n", content );

        free( content );
    }

    return 0;
}

Class Definitions

#ifndef LIBTOKENIZER_FILE_H
#define LIBTOKENIZER_FILE_H

#include "libtokenizer/Base.h"

//  Filesystem helpers.
bool  File_Exists      ( const char* filepath );  //  True when filepath can be lstat'ed.
char* File_Get_Contents( const char* filepath );  //  Whole file as a heap string; caller frees.

#endif

Class Methods

File Exists

bool File_Exists( const char* filepath )
{
    //  True when the path can be lstat'ed (so a dangling symlink still
    //  counts as existing).
    struct stat info;

    return (lstat( filepath, &info ) == 0);
}

File Get Contents

char* File_Get_Contents( const char* filepath )
{
    //  Read the whole file at filepath into a newly-allocated,
    //  NUL-terminated buffer.  Returns null when the file cannot be
    //  opened, stat'ed, or fully read; the caller frees the result.
    //  BUGFIX: the FILE* was never closed (handle leak), and the result
    //  of fread was ignored (a short read returned garbage content).
    char* content = NULL;
    FILE* fp      = fopen( filepath, "r" );

    if ( fp )
    {
        struct stat buf;

        if ( 0 == lstat( filepath, &buf ) )
        {
            size_t size = (size_t) buf.st_size;

            content = calloc( size + 1, sizeof( char ) );

            if ( content && (size > 0) && (1 != fread( content, size, 1, fp )) )
            {
                //  Short read: discard the partial buffer and report failure.
                free( content );
                content = NULL;
            }
        }
        fclose( fp );
    }
    return content;
}

Pushback Reader

#include <stdlib.h>
#include <stdio.h>

#include "libtokenizer/File.h"
#include "libtokenizer/PushbackReader.h"
#include "libtokenizer/Runtime.h"
#include "libtokenizer/String.h"

int main( int argc, char** argv )
{
    //  Read the sample file through a PushbackReader and verify every
    //  character against the raw file contents, randomly pushing back and
    //  re-reading three characters to exercise pushback.
    //  NOTE(review): c is never freed; it is untracked by Runtime so the
    //  leak check below cannot see it.
    const char* filepath = "./test/Sample.txt";

    char*           c = File_Get_Contents ( filepath );
    PushbackReader* r = PushbackReader_new( filepath );
    {
        int  len = StringLength( c );
        int  i   = 0;
        char ch;

        while ( 0 != (ch = PushbackReader_read( r )) )
        {
            if ( i == len )
            {
                fprintf( stderr, "Exceeded filelength!!!" );
                exit( -1 );
            }

            if ( c[i] != ch )
            {
                fprintf( stderr, "Character mismatch: %x != %x\n", c[i], ch );
                exit( -1 );
            }

            fprintf( stdout, "#" );

            int rnum = rand();

            //  Roughly half the time, step back three characters and
            //  re-read them: the reader should end up where it started.
            //  NOTE(review): when fewer than three characters have been
            //  read this drives head negative — see PushbackReader_pushback.
            if ( rnum < (RAND_MAX/2) )
            {
                fprintf( stdout, "<>" );

                PushbackReader_pushback( r );
                PushbackReader_pushback( r );
                PushbackReader_pushback( r );

                PushbackReader_read( r );
                PushbackReader_read( r );
                PushbackReader_read( r );
            }
            i++;
        }
        fprintf( stdout, "\n" );
    }
    PushbackReader_free( &r );

    if ( Runtime_Allocated() )
    {
        fprintf( stderr, "Memory leak: %i\n", Runtime_Allocated() );
    }

    return 0;
}
#ifndef LIBTOKENIZER_PUSHBACKREADER_H
#define LIBTOKENIZER_PUSHBACKREADER_H

//  Reads a file's contents one character at a time and lets the caller
//  step back over characters already read.
typedef struct _PushbackReader
{
    char* content;  //  Entire file contents, NUL-terminated.
    int   head;     //  Index of the next character to return.
    int   length;   //  Number of characters in content.

} PushbackReader;

PushbackReader* PushbackReader_new     ( const char*      filepath );  //  Reader over filepath's contents.
PushbackReader* PushbackReader_free    ( PushbackReader** self     );  //  Release and null the caller's pointer.
int             PushbackReader_read    ( PushbackReader*  self     );  //  Next character, or 0 at end of input.
PushbackReader* PushbackReader_pushback( PushbackReader*  self     );  //  Step back one character.

#endif
#include <stdlib.h>
#include <stdio.h>
#include "libtokenizer/File.h"
#include "libtokenizer/PushbackReader.h"
#include "libtokenizer/Runtime.h"
#include "libtokenizer/String.h"

PushbackReader* PushbackReader_new( const char* filepath )
{
    //  Create a reader over the contents of filepath.
    //  A missing file yields an empty reader rather than an error.
    PushbackReader* self = Runtime_Calloc( 1, sizeof( PushbackReader ) );

    if ( NULL == self ) return NULL;

    self->head    = 0;
    self->content = File_Exists( filepath ) ? File_Get_Contents( filepath )
                                            : StringCopy( "" );
    self->length  = StringLength( self->content );

    return self;
}

PushbackReader* PushbackReader_free( PushbackReader** self )
{
    //  Release the content buffer and the reader, then null the caller's
    //  pointer.  Always returns null.
    PushbackReader* reader = *self;

    if ( NULL != reader )
    {
        free( reader->content );
        reader->content = NULL;
        reader->length  = 0;

        *self = Runtime_Free( reader );
    }
    return *self;
}

//  Return the next character and advance, or 0 at end of input
//  (or for a null reader).
int PushbackReader_read( PushbackReader* self )
{
    return (self && (self->head < self->length)) ? self->content[self->head++] : 0;
}

//  Step the read head back one character so it is returned again.
//  NOTE(review): there is no lower-bound check — pushing back more
//  characters than have been read leaves head negative and the next read
//  indexes before the start of content; callers must keep reads and
//  pushbacks balanced (the testPushbackReader harness relies on this
//  symmetric arithmetic, so a clamp here would change its behaviour).
PushbackReader* PushbackReader_pushback( PushbackReader* self )
{
    self->head--;
    return self;
}
//  Reads lexeme-sized chunks out of a content string, remembering how far
//  the last read advanced (delta) so pushback() can rewind it exactly.
export function PushbackReader( content )
{
    this.content = content;
    this.head    = 0;
    this.delta   = 0;
}
PushbackReader.prototype.read
=
function()
{
    //  Return the next lexeme-sized chunk: normally one character, two
    //  characters for a backslash escape, or up to three characters for a
    //  quoted character literal such as 'a'.  this.delta records how far
    //  the head advanced so pushback() can rewind exactly one read.
    var ch = this.content[this.head++];

    this.delta = 1;

    if ( "\\" == ch )
    {
        //  Consume the escaped character as part of this read.
        ch += this.content[this.head++];
        this.delta = 2;
    }
    else if ( "'" == ch )
    {
        //  Consume up to two further characters (the literal and the
        //  closing quote), stopping early at end of content.
        for ( var n = 0; n < 2; n++ )
        {
            if ( this.content[this.head] )
            {
                ch += this.content[this.head++];
                this.delta++;
            }
        }
    }

    return ch;
}
//  Rewind exactly the span consumed by the most recent read(); a second
//  pushback without an intervening read is a no-op.
PushbackReader.prototype.pushback = function()
{
    this.head  -= this.delta;
    this.delta  = 0;
};

Queue

Example usage

#include <stdlib.h>
#include <stdio.h>
#include "libtokenizer/Queue.h"
#include "libtokenizer/Runtime.h"
#include "libtokenizer/String.h"

int main( int argc, char** argv )
{
    //  Move 100 heap strings through two queues, draining both and
    //  verifying the second empties out with no leaked tracked allocations.
    Queue* q1 = Queue_new();
    Queue* q2 = Queue_new();

    int len;

    if ( 1 )
    {
        for ( int i=0; i < 100; i++ )
        {
            char* test = StringCopy( "test" );

            Queue_addTail( q1, test );
        }

        len = Queue_getLength( q1 );

        fprintf( stdout, "Removing %i items\n", len );
        for ( int i=0; i < len; i++ )
        {
            char* test = (char*) Queue_removeHead( q1 );

            fprintf( stdout, "%2i: %s\n", i, test );

            Queue_addHead( q2, test );
        }
        len = Queue_getLength( q1 );

        fprintf( stdout, "%i items left\n", len );
    }
    q1 = Queue_free( q1 );

    if ( 1 )
    {
        len = Queue_getLength( q2 );
        fprintf( stdout, "Removing %i items from target\n", len );
        for ( int i=0; i < len; i++ )
        {
            char* test = (char*) Queue_removeHead( q2 );

            fprintf( stdout, "%2i: %s\n", i, test );

            free( test );
        }
        len = Queue_getLength( q2 );

        fprintf( stdout, "%i items left\n", len );

        //  The drained queue must be empty...
        if ( 0 != len )
        {
            fprintf( stderr, "Unusual circumstance: length\n" );
            exit( -1 );
        }

        //  ...and removing from an empty queue must return null.
        if ( (char*) Queue_removeHead( q2 ) )
        {
            fprintf( stderr, "Unusual circumstance: head\n" );
            exit( -1 );
        }

        fprintf( stdout, "%i items left\n", len );
    }
    q2 = Queue_free( q2 );

    if ( Runtime_Allocated() )
    {
        fprintf( stderr, "Memory leak: %i\n", Runtime_Allocated() );
    }
}

C

#ifndef LIBTOKENIZER_QUEUE_H
#define LIBTOKENIZER_QUEUE_H

#include "libtokenizer/Array.h"

//  A FIFO queue implemented as a thin wrapper around Array.
typedef struct _Queue
{
    Array* inner;

} Queue;

Queue* Queue_new       ();                            //  Allocate an empty queue.
Queue* Queue_free      ( Queue* self );               //  Release the queue; returns null.
Queue* Queue_addHead   ( Queue* self, void* object ); //  Prepend object.
Queue* Queue_addTail   ( Queue* self, void* object ); //  Append object.
void*  Queue_removeHead( Queue* self );               //  Remove and return the head, or null.
int    Queue_getLength ( Queue* self );               //  Number of queued objects.

#endif
#include <stdlib.h>
#include "libtokenizer/Queue.h"
#include "libtokenizer/Runtime.h"

static void allocArray( Queue* self )
{
    //  Lazily create the backing array if it has not been allocated yet.
    if ( NULL == self->inner ) self->inner = Array_new( NULL );
}

Queue* Queue_new()
{
    //  Allocate a queue backed by a freshly-created inner array.
    Queue* self = Runtime_Calloc( 1, sizeof( Queue ) );

    if ( NULL != self ) self->inner = Array_new( NULL );

    return self;
}

Queue* Queue_free( Queue* self )
{
    //  Release the inner array and then the queue itself.  Always returns
    //  null so callers can clear their pointer in one assignment.
    if ( NULL == self ) return 0;

    if ( self->inner )
    {
        self->inner = Array_free( &self->inner );
    }
    Runtime_Free( self );

    return 0;
}

Queue* Queue_addHead( Queue* self, void* object )
{
    //  Prepend object so it becomes the new head of the queue.
    allocArray( self );
    Array_unshift( self->inner, object );

    return self;
}

Queue* Queue_addTail( Queue* self, void* object )
{
    //  Append object at the tail of the queue.
    allocArray( self );
    Array_push( self->inner, object );

    return self;
}

void* Queue_removeHead( Queue* self )
{
    //  Detach and return the current head, or null when the queue is empty.
    allocArray( self );
    return Array_shift( self->inner );
}

int Queue_getLength( Queue* self )
{
    //  Number of objects currently queued.
    allocArray( self );
    return Array_length( self->inner );
}

Javascript

//  A FIFO queue backed by a plain array.
function Queue()
{
    this.inner = Array();
}

//  Append object at the tail of the queue.
Queue.prototype.addTail = function( object )
{
    this.inner.push( object );
};

//  Remove and return the head of the queue (undefined when empty).
Queue.prototype.removeHead = function()
{
    return this.inner.shift();
};

//  Prepend object so it becomes the new head of the queue.
Queue.prototype.addHead = function( object )
{
    this.inner.unshift( object );
};

//  Number of objects currently queued.
Queue.prototype.getLength = function()
{
    return this.inner.length;
};
#ifndef LIBTOKENIZER_RUNTIME_H
#define LIBTOKENIZER_RUNTIME_H

#include <stdlib.h>

//  Allocation wrappers that keep a count of live allocations so the test
//  harnesses can detect leaks via Runtime_Allocated().
void* Runtime_Calloc( size_t count, size_t size );  //  calloc plus leak tracking.
void* Runtime_Free  ( void* ptr  );                 //  free plus leak tracking; returns null.
int   Runtime_Allocated();                          //  Outstanding allocation count.

#endif
#include <stdlib.h>
#include "libtokenizer/Base.h"
#include "libtokenizer/Runtime.h"

//  Count of live (not yet freed) tracked allocations.
static int allocated = 0;

void* Runtime_Calloc( size_t count, size_t size )
{
    //  Zero-allocating wrapper that tracks the number of live allocations.
    //  BUGFIX: only count allocations that actually succeeded, so a failed
    //  calloc no longer skews the leak counter.
    void* ptr = calloc( count, size );

    if ( ptr ) allocated++;

    return ptr;
}

void* Runtime_Free( void* ptr  )
{
    //  Counterpart to Runtime_Calloc; always returns null so callers can
    //  clear their pointer in one statement.
    //  BUGFIX: a null pointer is now ignored instead of decrementing the
    //  leak counter (some callers, e.g. TokenGroup_free, pass null).
    if ( ptr )
    {
        allocated--;

        free( ptr );
    }

    return NULL;
}

//  Number of tracked allocations not yet freed; non-zero at exit
//  indicates a leak.
int Runtime_Allocated()
{
    return allocated;
}

String Buffer

Example usage

#include <stdio.h>
#include "libtokenizer/Runtime.h"
#include "libtokenizer/StringBuffer.h"

int main( int argc, char** argv )
{
    //  Append "test" ten times, printing the growing buffer each time,
    //  then free it and check for leaked tracked allocations.
    StringBuffer* sb = StringBuffer_new();

    for ( int i=0; i < 10; i++ )
    {
        StringBuffer_append( sb, "test" );

        const char* content = StringBuffer_content( sb );

        fprintf( stdout, "%2i: %s\n", i, content );
    }

    StringBuffer_free( sb );

    if ( Runtime_Allocated() )
    {
        fprintf( stderr, "Memory leak: %i\n", Runtime_Allocated() );
    }
}

Class definitions

public class
{
    @content: string*
}
#ifndef LIBTOKENIZER_STRINGBUFFER_H
#define LIBTOKENIZER_STRINGBUFFER_H

#include "libtokenizer/Base.h"

//  An appendable, heap-owned character buffer.
typedef struct _StringBuffer
{
    char* content;  //  NUL-terminated accumulated text.
    int   length;   //  Accumulated length in characters.

} StringBuffer;

StringBuffer* StringBuffer_new        ();                                          //  Empty buffer.
StringBuffer* StringBuffer_free       ( StringBuffer* self                     );  //  Release the buffer.
StringBuffer* StringBuffer_append     ( StringBuffer* self, const char* suffix );  //  Append a string.
StringBuffer* StringBuffer_append_char( StringBuffer* self, char        ch     );  //  Append one character.
const char*   StringBuffer_content    ( StringBuffer* self                     );  //  Borrowed pointer to the text.
bool          StringBuffer_isEmpty    ( StringBuffer* self                     );  //  True when nothing appended.

#endif
//  A growable string accumulator backed by a plain string.
function StringBuffer()
{
    this.inner = "";
}
public class StringBuffer {

    java.lang.StringBuffer inner;

Constructors

#include <stdlib.h>
#include "libtokenizer/Runtime.h"
#include "libtokenizer/String.h"
#include "libtokenizer/StringBuffer.h"

StringBuffer* StringBuffer_new()
{
    //  Allocate an empty buffer owning a zero-length content string.
    StringBuffer* self = Runtime_Calloc( 1, sizeof( StringBuffer ) );

    if ( NULL != self )
    {
        self->length  = 0;
        self->content = StringCopy( "" );
    }
    return self;
}
//  A growable string accumulator backed by a plain string.
function StringBuffer()
{
    this.inner = "";
}

Destructors

StringBuffer* StringBuffer_free( StringBuffer* self )
{
    //  Release the owned content buffer and the struct itself; always
    //  returns null so the caller can clear its pointer in one statement.
    //  BUGFIX: tolerate a null argument instead of dereferencing it,
    //  matching the null-safety of the other *_free functions here.
    if ( self )
    {
        free( self->content );
        self->content = 0;
        self->length  = 0;

        Runtime_Free( self );
    }

    return 0;
}

StringBuffer.append

public append( suffix: string& )
{
    @content = @content.concat( suffix );
}
StringBuffer* StringBuffer_append( StringBuffer* self, const char* suffix )
{
    //  Grow the buffer by concatenating suffix onto the current content,
    //  releasing the previous content string.
    char* previous = self->content;

    self->content  = StringCat( previous, suffix );
    self->length  += StringLength( suffix );

    free( previous );

    return self;
}

StringBuffer* StringBuffer_append_char( StringBuffer* self, char ch )
{
    //  Append a single character by wrapping it as a NUL-terminated string.
    char suffix[2];

    suffix[0] = ch;
    suffix[1] = '\0';

    return StringBuffer_append( self, suffix );
}
//  Concatenate string onto the accumulated content.
StringBuffer.prototype.append = function( string )
{
    this.inner = this.inner + string;
};

StringBuffer.content

//  Borrowed pointer to the accumulated text; the buffer retains ownership.
const char* StringBuffer_content( StringBuffer* self )
{
    return self->content;
}

StringBuffer.isEmpty

//  True when the accumulated content is the empty string.
bool StringBuffer_isEmpty( StringBuffer* self )
{
    return (0 == StringLength( self->content ));
}

Term

Class Definitions

#ifndef LIBTOKENIZER_TERM_H
#define LIBTOKENIZER_TERM_H

//  ANSI terminal escape sequences used to colour tokenizer output.
//  Note that COLOR_STRING, COLOR_VALUE and COLOR_CHAR share the same
//  sequence (yellow).
#define COLOR_NORMAL   "\033[00m"
#define COLOR_BOLD     "\033[01m"
#define COLOR_LIGHT    "\033[02m"
#define COLOR_STRING   "\033[33m"
#define COLOR_TYPE     "\033[36m"
#define COLOR_MODIFIER "\033[94m"
#define COLOR_VALUE    "\033[33m"
#define COLOR_CHAR     "\033[33m"
#define COLOR_COMMENT  "\033[32m"
#define COLOR_UNKNOWN  "\033[41m"

//  Write the given escape sequence to stream (a FILE*).
void Term_Colour( void* stream, const char* color );

#endif

Term.Colour

#include <stdio.h>

/*
 *  Writes the colour escape string verbatim to the given output stream.
 *  'stream' is declared void* so headers need not include <stdio.h>;
 *  it must actually be a FILE*.
 */
void Term_Colour( void* stream, const char* color )
{
    fputs( color, (FILE*) stream );
}

Make

CFLAGS=-O0
BIN=bin
INC=include
LIB=lib
OBJ=obj
SRC=src

# None of these targets name real files; declare them phony so stale
# files called e.g. "test" or "c" cannot mask them.
.PHONY: all dirs c js tests test

all: dirs c js

dirs:
	mkdir -p $(BIN) $(OBJ) $(LIB)

c: dirs
	mkdir -p $(OBJ)/c
	cc -c $(CFLAGS) -I$(INC) -o $(OBJ)/c/Runtime.o         $(SRC)/c/Runtime.c
	cc -c $(CFLAGS) -I$(INC) -o $(OBJ)/c/Term.o            $(SRC)/c/Term.c
	cc -c $(CFLAGS) -I$(INC) -o $(OBJ)/c/Array.o           $(SRC)/c/Array.c
	cc -c $(CFLAGS) -I$(INC) -o $(OBJ)/c/File.o            $(SRC)/c/File.c
	cc -c $(CFLAGS) -I$(INC) -o $(OBJ)/c/Queue.o           $(SRC)/c/Queue.c
	cc -c $(CFLAGS) -I$(INC) -o $(OBJ)/c/String.o          $(SRC)/c/String.c
	cc -c $(CFLAGS) -I$(INC) -o $(OBJ)/c/StringBuffer.o    $(SRC)/c/StringBuffer.c
	cc -c $(CFLAGS) -I$(INC) -o $(OBJ)/c/PushbackReader.o  $(SRC)/c/PushbackReader.c
	cc -c $(CFLAGS) -I$(INC) -o $(OBJ)/c/TokenGroup.o      $(SRC)/c/TokenGroup.c
	cc -c $(CFLAGS) -I$(INC) -o $(OBJ)/c/Token.o           $(SRC)/c/Token.c
	cc -c $(CFLAGS) -I$(INC) -o $(OBJ)/c/Tokenizer.o       $(SRC)/c/Tokenizer.c

js:
	mkdir -p $(LIB)/js
	cat $(SRC)/js/*.js > $(LIB)/js/libtokenizer.js

# Linking needs the objects from 'c' (and the directories it creates),
# so build them first rather than assuming a prior 'make c'.
tests: c
	cc $(CFLAGS) -I$(INC) -o $(BIN)/testArray          $(SRC)/c/testArray.c          $(OBJ)/c/*.o
	cc $(CFLAGS) -I$(INC) -o $(BIN)/testFile           $(SRC)/c/testFile.c           $(OBJ)/c/*.o
	cc $(CFLAGS) -I$(INC) -o $(BIN)/testQueue          $(SRC)/c/testQueue.c          $(OBJ)/c/*.o
	cc $(CFLAGS) -I$(INC) -o $(BIN)/testString         $(SRC)/c/testString.c         $(OBJ)/c/*.o
	cc $(CFLAGS) -I$(INC) -o $(BIN)/testStringBuffer   $(SRC)/c/testStringBuffer.c   $(OBJ)/c/*.o
	cc $(CFLAGS) -I$(INC) -o $(BIN)/testPushbackReader $(SRC)/c/testPushbackReader.c $(OBJ)/c/*.o
	cc $(CFLAGS) -I$(INC) -o $(BIN)/testToken          $(SRC)/c/testToken.c          $(OBJ)/c/*.o
	cc $(CFLAGS) -I$(INC) -o $(BIN)/testTokenizer      $(SRC)/c/testTokenizer.c      $(OBJ)/c/*.o

test:
	$(BIN)/testArray          > /dev/null
	$(BIN)/testFile           > /dev/null
	$(BIN)/testQueue          > /dev/null
	$(BIN)/testString         > /dev/null
	$(BIN)/testStringBuffer   > /dev/null
	$(BIN)/testPushbackReader > /dev/null
	$(BIN)/testToken          > /dev/null
	$(BIN)/testTokenizer      > /dev/null