/* * Tiny BASIC * Tokenisation module * * Copyright (C) Damian Gareth Walker 2019 * Created: 04-Aug-2019 */ /* included headers */ #include #include #include #include "token.h" #include "tokeniser.h" #include "common.h" /* * Data definitions */ /* modes of reading */ typedef enum { DEFAULT_MODE, /* we have no idea what's coming */ COMMENT_MODE, /* reading a comment */ WORD_MODE, /* reading an identifier or keyword */ NUMBER_MODE, /* reading a numeric constant */ LESS_THAN_MODE, /* reading an operator staring with < */ GREATER_THAN_MODE, /* reading an operator starting with > */ STRING_LITERAL_MODE, /* reading a string literal */ UNKNOWN_MODE /* we are lost */ } Mode; /* current state information */ typedef struct { Token *token; /* token to return */ Mode mode; /* current reading mode */ int ch; /* last-read character */ char *content; /* content of token under construction */ int max; /* memory reserved for content */ } TokeniserState; /* Private data */ typedef struct { FILE *input; /* the input file */ int line, /* current line in the input file */ pos, /* current position on the input line */ start_line, /* line on which a token started */ start_pos; /* position on which a token started */ } Private; /* * File level variables */ /* convenience variables */ static TokenStream *this; /* token stream passed in to public method */ static Private *data; /* private data for this */ /* * Level 2 Tokeniser Routines */ /* * Read a character and update the position counter * globals: * int line current line after character read * int pos current character position after character read * params: * TokeniserState* state current state of the tokeniser * returns: * int character just read */ static int read_character (TokeniserState *state) { int ch; /* character read from stream */ /* read the character */ ch = fgetc (data->input); /* update the position and line counters */ if (ch == '\n') { ++data->line; data->pos = 0; } else { ++data->pos; } /* return the character */ return ch; } /* * Push a character back into the input stream and update position markers * globals: * int line line number rolled back * int pos character position rolled back * params: * TokeniserState* state current state of the tokeniser */ static void unread_character (TokeniserState *state) { ungetc (state->ch, data->input); if (state->ch == '\n') --data->line; else --data->pos; } /* * Append the last read character to the token content * params: * TokeniserState* state current state of the tokeniser */ static void store_character (TokeniserState *state) { /* variable declarations */ char *temp; /* temporary pointer to content */ int length; /* current length of token */ /* allocate more memory for the token content if necessary */ if (strlen (state->content) == state->max - 1) { temp = state->content; state->max *= 2; state->content = malloc (state->max); strcpy (state->content, temp); free (temp); } /* now add the character to the token */ length = strlen (state->content); state->content [length++] = state->ch; state->content [length] = '\0'; } /* * Identify the various recognised symbols * params: * int ch the character to identify * returns: * TokenClass the token class recognised by the parser */ static TokenClass identify_symbol (int ch) { switch (ch) { case '+': return TOKEN_PLUS; break; case '-': return TOKEN_MINUS; break; case '*': return TOKEN_MULTIPLY; break; case '/': return TOKEN_DIVIDE; break; case '=': return TOKEN_EQUAL; break; case '(': return TOKEN_LEFT_PARENTHESIS; break; case ')': return TOKEN_RIGHT_PARENTHESIS; break; case ',': return TOKEN_COMMA; break; default: return TOKEN_SYMBOL; } } static TokenClass identify_word (char *word) { if (strlen (word) == 1) return TOKEN_VARIABLE; else if (! tinybasic_strcmp (word, "LET")) return TOKEN_LET; else if (! tinybasic_strcmp (word, "IF")) return TOKEN_IF; else if (! tinybasic_strcmp (word, "THEN")) return TOKEN_THEN; else if (! tinybasic_strcmp (word, "GOTO")) return TOKEN_GOTO; else if (! tinybasic_strcmp (word, "GOSUB")) return TOKEN_GOSUB; else if (! tinybasic_strcmp (word, "RETURN")) return TOKEN_RETURN; else if (! tinybasic_strcmp (word, "END")) return TOKEN_END; else if (! tinybasic_strcmp (word, "PRINT")) return TOKEN_PRINT; else if (! tinybasic_strcmp (word, "INPUT")) return TOKEN_INPUT; else if (! tinybasic_strcmp (word, "REM")) return TOKEN_REM; else return TOKEN_WORD; } /* * Identify compound (multi-character) symbols. * Also identifies some single-character symbols that can form * the start of multi-character symbols. * params: * char* symbol the symbol to identify * returns: * TokenClass the identification */ static TokenClass identify_compound_symbol (char *symbol) { if (! strcmp (symbol, "<>") || ! strcmp (symbol, "><")) return TOKEN_UNEQUAL; else if (! strcmp (symbol, "<")) return TOKEN_LESSTHAN; else if (! strcmp (symbol, "<=")) return TOKEN_LESSOREQUAL; else if (! strcmp (symbol, ">")) return TOKEN_GREATERTHAN; else if (! strcmp (symbol, ">=")) return TOKEN_GREATEROREQUAL; else return TOKEN_SYMBOL; } /* * Level 1 Tokeniser Routines */ /* * Default mode - deal with character when state is unknown * globals: * int line current line in the source file * int pos current character position in the source * int start_line line on which the current token started * int start_pos char pos on which the current token started * params: * TokeniserState* state current state of the tokeniser */ static void default_mode (TokeniserState *state) { /* deal with non-EOL whitespace */ if (state->ch == ' ' || state->ch == '\t') { state->ch = read_character (state); data->start_line = data->line; data->start_pos = data->pos; } /* deal with EOL whitespace */ else if (state->ch == '\n') { data->start_line = data->line - 1; data->start_pos = data->pos; state->token = new_Token_init (TOKEN_EOL, data->start_line, data->start_pos, state->content); } /* alphabetic characters start a word */ else if ((state->ch >= 'A' && state->ch <= 'Z') || (state->ch >= 'a' && state->ch <= 'z')) { data->start_line = data->line; data->start_pos = data->pos; state->mode = WORD_MODE; } /* digits start a number */ else if (state->ch >= '0' && state->ch <= '9') state->mode = NUMBER_MODE; /* check for tokens starting with less-than (<, <=, <>) */ else if (state->ch == '<') { data->start_line = data->line; data->start_pos = data->pos; store_character (state); state->ch = read_character (state); state->mode = LESS_THAN_MODE; } /* check for tokens starting with greater-than (>, >=) */ else if (state->ch == '>') { data->start_line = data->line; data->start_pos = data->pos; store_character (state); state->ch = read_character (state); state->mode = GREATER_THAN_MODE; } /* deal with other symbol operators */ else if (strchr ("+-*/=(),", state->ch) != NULL) { data->start_line = data->line; data->start_pos = data->pos; store_character (state); state->token = new_Token_init (identify_symbol (state->ch), data->start_line, data->start_pos, state->content); } /* double quotes start a string literal */ else if (state->ch == '"') { data->start_line = data->line; data->start_pos = data->pos; state->ch = read_character (state); state->mode = STRING_LITERAL_MODE; } /* detect end of file */ else if (state->ch == EOF) { data->start_line = data->line; data->start_pos = data->pos; state->token = new_Token_init (TOKEN_EOF, data->start_line, data->start_pos, state->content); } /* other characters are illegal */ else { data->start_line = data->line; data->start_pos = data->pos; store_character (state); state->token = new_Token_init (TOKEN_ILLEGAL, data->start_line, data->start_pos, state->content); } } /* * Word mode - deal with character when building a word token * globals: * int start_line line on which the current token started * int start_pos char pos on which the current token started * params: * TokeniserState* state current state of the tokeniser */ static void word_mode (TokeniserState *state) { /* local variables */ TokenClass class; /* recognised class of keyword */ /* add letters and digits to the token */ if ((state->ch >= 'A' && state->ch <= 'Z') || (state->ch >= 'a' && state->ch <= 'z')) { store_character (state); state->ch = read_character (state); } /* other characters are pushed back for the next token */ else { if (state->ch != EOF) unread_character (state); class = identify_word (state->content); if (class == TOKEN_REM) { *state->content = '\0'; state->mode = COMMENT_MODE; } else state->token = new_Token_init (class, data->start_line, data->start_pos, state->content); } } /* * Comment mode - skip till end of line after a REM * globals: * int start_line line on which the current token started * int start_pos char pos on which the current token started * params: * TokeniserState* state current state of the tokeniser */ static void comment_mode (TokeniserState *state) { if (state->ch == '\n') state->mode = DEFAULT_MODE; else state->ch = read_character (state); } /* * Number mode - building a number token (integer only) * globals: * int start_line line on which the current token started * int start_pos char pos on which the current token started * params: * TokeniserState* state current state of the tokeniser */ static void number_mode (TokeniserState *state) { /* add digits to the token */ if (state->ch >= '0' && state->ch <= '9') { store_character (state); state->ch = read_character (state); } /* other characters are pushed back for the next token */ else { if (state->ch != EOF) unread_character (state); state->token = new_Token_init (TOKEN_NUMBER, data->start_line, data->start_pos, state->content); } } /* * Less than mode - checking for <> and <= operators * globals: * int start_line line on which the current token started * int start_pos char pos on which the current token started * params: * TokeniserState* state current state of the tokeniser */ static void less_than_mode (TokeniserState *state) { if (state->ch == '=' || state->ch == '>') store_character (state); else unread_character (state); state->token = new_Token_init (identify_compound_symbol (state->content), data->start_line, data->start_pos, state->content); } /* * Greater than mode - checking for >= and >< operators * globals: * int start_line line on which the current token started * int start_pos char pos on which the current token started * params: * TokeniserState* state current state of the tokeniser */ static void greater_than_mode (TokeniserState *state) { if (state->ch == '=' || state->ch == '<') store_character (state); else ungetc (state->ch, data->input); state->token = new_Token_init (identify_compound_symbol (state->content), data->start_line, data->start_pos, state->content); } /* * String literal mode - reading a string * globals: * int start_line line on which the current token started * int start_pos char pos on which the current token started * params: * TokeniserState* state current state of the tokeniser */ static void string_literal_mode (TokeniserState *state) { /* a quote terminates the string */ if (state->ch == '"') state->token = new_Token_init (TOKEN_STRING, data->start_line, data->start_pos, state->content); /* a backslash escapes the next character */ else if (state->ch == '\\') { state->ch = read_character (state); store_character (state); state->ch = read_character (state); } /* EOF generates an error */ else if (state->ch == EOF) state->token = new_Token_init (TOKEN_ILLEGAL, data->start_line, data->start_pos, state->content); /* all other characters are part of the string */ else { store_character (state); state->ch = read_character (state); } } /* * Top Level Tokeniser Routines */ /* * Get the next token * params: * TokenStream* token_stream the token stream being processed * returns: * Token* the token built */ static Token *next (TokenStream *token_stream) { /* local variables */ TokeniserState state; /* current state of reading */ Token *return_token; /* token to return */ /* initialise */ this = token_stream; data = this->data; state.token = NULL; state.mode = DEFAULT_MODE; state.max = 1024; state.content = malloc (state.max); *(state.content) = '\0'; state.ch = read_character (&state); /* main loop */ while (state.token == NULL) { switch (state.mode) { case DEFAULT_MODE: default_mode (&state); break; case COMMENT_MODE: comment_mode (&state); break; case WORD_MODE: word_mode (&state); break; case NUMBER_MODE: number_mode (&state); break; case LESS_THAN_MODE: less_than_mode (&state); break; case GREATER_THAN_MODE: greater_than_mode (&state); break; case STRING_LITERAL_MODE: string_literal_mode (&state); break; default: state.token = new_Token_init (TOKEN_EOF, data->start_line, data->start_pos, state.content); state.ch = EOF; /* temporary hack */ } } /* store token and release state memory */ return_token = state.token; free (state.content); /* return result */ return return_token; } /* * Getter for the current line number * paramss: * TokenStream* token_stream the token stream being processed * returns: * int the current line number returned */ static int get_line (TokenStream *token_stream) { this = token_stream; data = this->data; return data->line; } /* * Destructor for a TokenStream * params: * TokenStream* token_stream the doomed token stream */ static void destroy (TokenStream *token_stream) { if (token_stream) { if (token_stream->data) free (token_stream->data); free (token_stream); } } /* * Constructors */ /* * Constructor for TokenStream * params: * FILE* input Input file * returns: * TokenStream* The new token stream */ TokenStream *new_TokenStream (FILE *input) { /* allocate the memory */ this = malloc (sizeof (TokenStream)); this->data = data = malloc (sizeof (Private)); /* initialise methods */ this->next = next; this->get_line = get_line; this->destroy = destroy; /* initialise data */ data->input = input; data->line = data->start_line = 1; data->pos = data->start_pos = 0; /* return new token stream */ return this; }