forked from KolibriOS/kolibrios
Added new port TinyBasic
(An improved version in conjunction with ktcc can generate executable files.) git-svn-id: svn://kolibrios.org@8733 a494cfbc-eb01-0410-851d-a64ba20cac60
This commit is contained in:
602
programs/develop/tinybasic-1.0.4/src/tokeniser.c
Normal file
602
programs/develop/tinybasic-1.0.4/src/tokeniser.c
Normal file
@@ -0,0 +1,602 @@
|
||||
/*
|
||||
* Tiny BASIC
|
||||
* Tokenisation module
|
||||
*
|
||||
* Copyright (C) Damian Gareth Walker 2019
|
||||
* Created: 04-Aug-2019
|
||||
*/
|
||||
|
||||
|
||||
/* included headers */
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "token.h"
|
||||
#include "tokeniser.h"
|
||||
#include "common.h"
|
||||
|
||||
|
||||
/*
|
||||
* Data definitions
|
||||
*/
|
||||
|
||||
|
||||
/* reading modes: the kind of token the tokeniser is currently working on */
typedef enum {
    DEFAULT_MODE = 0,       /* nothing recognised yet */
    COMMENT_MODE,           /* skipping the text of a comment */
    WORD_MODE,              /* collecting an identifier or keyword */
    NUMBER_MODE,            /* collecting a numeric constant */
    LESS_THAN_MODE,         /* collecting an operator starting with < */
    GREATER_THAN_MODE,      /* collecting an operator starting with > */
    STRING_LITERAL_MODE,    /* collecting a string literal */
    UNKNOWN_MODE            /* no valid mode */
} Mode;
|
||||
|
||||
/* current state information */
|
||||
typedef struct {
|
||||
Token *token; /* token to return */
|
||||
Mode mode; /* current reading mode */
|
||||
int ch; /* last-read character */
|
||||
char *content; /* content of token under construction */
|
||||
int max; /* memory reserved for content */
|
||||
} TokeniserState;
|
||||
|
||||
/* private data attached to each token stream */
typedef struct {
    FILE *input;        /* the file the tokens are read from */
    int line;           /* current line in the input file */
    int pos;            /* current character position on that line */
    int start_line;     /* line on which the current token started */
    int start_pos;      /* position at which the current token started */
} Private;
|
||||
|
||||
|
||||
/*
|
||||
* File level variables
|
||||
*/
|
||||
|
||||
|
||||
/* convenience variables */
|
||||
static TokenStream *this; /* token stream passed in to public method */
|
||||
static Private *data; /* private data for this */
|
||||
|
||||
|
||||
/*
|
||||
* Level 2 Tokeniser Routines
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
* Read a character and update the position counter
|
||||
* globals:
|
||||
* int line current line after character read
|
||||
* int pos current character position after character read
|
||||
* params:
|
||||
* TokeniserState* state current state of the tokeniser
|
||||
* returns:
|
||||
* int character just read
|
||||
*/
|
||||
static int read_character (TokeniserState *state) {
|
||||
|
||||
int ch; /* character read from stream */
|
||||
|
||||
/* read the character */
|
||||
ch = fgetc (data->input);
|
||||
|
||||
/* update the position and line counters */
|
||||
if (ch == '\n') {
|
||||
++data->line;
|
||||
data->pos = 0;
|
||||
} else {
|
||||
++data->pos;
|
||||
}
|
||||
|
||||
/* return the character */
|
||||
return ch;
|
||||
}
|
||||
|
||||
/*
|
||||
* Push a character back into the input stream and update position markers
|
||||
* globals:
|
||||
* int line line number rolled back
|
||||
* int pos character position rolled back
|
||||
* params:
|
||||
* TokeniserState* state current state of the tokeniser
|
||||
*/
|
||||
static void unread_character (TokeniserState *state) {
|
||||
ungetc (state->ch, data->input);
|
||||
if (state->ch == '\n')
|
||||
--data->line;
|
||||
else
|
||||
--data->pos;
|
||||
}
|
||||
|
||||
/*
|
||||
* Append the last read character to the token content
|
||||
* params:
|
||||
* TokeniserState* state current state of the tokeniser
|
||||
*/
|
||||
static void store_character (TokeniserState *state) {
|
||||
|
||||
/* variable declarations */
|
||||
char *temp; /* temporary pointer to content */
|
||||
int length; /* current length of token */
|
||||
|
||||
/* allocate more memory for the token content if necessary */
|
||||
if (strlen (state->content) == state->max - 1) {
|
||||
temp = state->content;
|
||||
state->max *= 2;
|
||||
state->content = malloc (state->max);
|
||||
strcpy (state->content, temp);
|
||||
free (temp);
|
||||
}
|
||||
|
||||
/* now add the character to the token */
|
||||
length = strlen (state->content);
|
||||
state->content [length++] = state->ch;
|
||||
state->content [length] = '\0';
|
||||
}
|
||||
|
||||
/*
|
||||
* Identify the various recognised symbols
|
||||
* params:
|
||||
* int ch the character to identify
|
||||
* returns:
|
||||
* TokenClass the token class recognised by the parser
|
||||
*/
|
||||
static TokenClass identify_symbol (int ch) {
|
||||
switch (ch) {
|
||||
case '+':
|
||||
return TOKEN_PLUS;
|
||||
break;
|
||||
case '-':
|
||||
return TOKEN_MINUS;
|
||||
break;
|
||||
case '*':
|
||||
return TOKEN_MULTIPLY;
|
||||
break;
|
||||
case '/':
|
||||
return TOKEN_DIVIDE;
|
||||
break;
|
||||
case '=':
|
||||
return TOKEN_EQUAL;
|
||||
break;
|
||||
case '(':
|
||||
return TOKEN_LEFT_PARENTHESIS;
|
||||
break;
|
||||
case ')':
|
||||
return TOKEN_RIGHT_PARENTHESIS;
|
||||
break;
|
||||
case ',':
|
||||
return TOKEN_COMMA;
|
||||
break;
|
||||
default:
|
||||
return TOKEN_SYMBOL;
|
||||
}
|
||||
}
|
||||
|
||||
static TokenClass identify_word (char *word) {
|
||||
if (strlen (word) == 1)
|
||||
return TOKEN_VARIABLE;
|
||||
else if (! tinybasic_strcmp (word, "LET"))
|
||||
return TOKEN_LET;
|
||||
else if (! tinybasic_strcmp (word, "IF"))
|
||||
return TOKEN_IF;
|
||||
else if (! tinybasic_strcmp (word, "THEN"))
|
||||
return TOKEN_THEN;
|
||||
else if (! tinybasic_strcmp (word, "GOTO"))
|
||||
return TOKEN_GOTO;
|
||||
else if (! tinybasic_strcmp (word, "GOSUB"))
|
||||
return TOKEN_GOSUB;
|
||||
else if (! tinybasic_strcmp (word, "RETURN"))
|
||||
return TOKEN_RETURN;
|
||||
else if (! tinybasic_strcmp (word, "END"))
|
||||
return TOKEN_END;
|
||||
else if (! tinybasic_strcmp (word, "PRINT"))
|
||||
return TOKEN_PRINT;
|
||||
else if (! tinybasic_strcmp (word, "INPUT"))
|
||||
return TOKEN_INPUT;
|
||||
else if (! tinybasic_strcmp (word, "REM"))
|
||||
return TOKEN_REM;
|
||||
else
|
||||
return TOKEN_WORD;
|
||||
}
|
||||
|
||||
/*
|
||||
* Identify compound (multi-character) symbols.
|
||||
* Also identifies some single-character symbols that can form
|
||||
* the start of multi-character symbols.
|
||||
* params:
|
||||
* char* symbol the symbol to identify
|
||||
* returns:
|
||||
* TokenClass the identification
|
||||
*/
|
||||
static TokenClass identify_compound_symbol (char *symbol) {
|
||||
if (! strcmp (symbol, "<>")
|
||||
|| ! strcmp (symbol, "><"))
|
||||
return TOKEN_UNEQUAL;
|
||||
else if (! strcmp (symbol, "<"))
|
||||
return TOKEN_LESSTHAN;
|
||||
else if (! strcmp (symbol, "<="))
|
||||
return TOKEN_LESSOREQUAL;
|
||||
else if (! strcmp (symbol, ">"))
|
||||
return TOKEN_GREATERTHAN;
|
||||
else if (! strcmp (symbol, ">="))
|
||||
return TOKEN_GREATEROREQUAL;
|
||||
else
|
||||
return TOKEN_SYMBOL;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Level 1 Tokeniser Routines
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
* Default mode - deal with character when state is unknown
|
||||
* globals:
|
||||
* int line current line in the source file
|
||||
* int pos current character position in the source
|
||||
* int start_line line on which the current token started
|
||||
* int start_pos char pos on which the current token started
|
||||
* params:
|
||||
* TokeniserState* state current state of the tokeniser
|
||||
*/
|
||||
static void default_mode (TokeniserState *state) {
|
||||
|
||||
/* deal with non-EOL whitespace */
|
||||
if (state->ch == ' ' ||
|
||||
state->ch == '\t') {
|
||||
state->ch = read_character (state);
|
||||
data->start_line = data->line;
|
||||
data->start_pos = data->pos;
|
||||
}
|
||||
|
||||
/* deal with EOL whitespace */
|
||||
else if (state->ch == '\n') {
|
||||
data->start_line = data->line - 1;
|
||||
data->start_pos = data->pos;
|
||||
state->token = new_Token_init
|
||||
(TOKEN_EOL, data->start_line, data->start_pos, state->content);
|
||||
}
|
||||
|
||||
/* alphabetic characters start a word */
|
||||
else if ((state->ch >= 'A' && state->ch <= 'Z') ||
|
||||
(state->ch >= 'a' && state->ch <= 'z')) {
|
||||
data->start_line = data->line;
|
||||
data->start_pos = data->pos;
|
||||
state->mode = WORD_MODE;
|
||||
}
|
||||
|
||||
/* digits start a number */
|
||||
else if (state->ch >= '0' && state->ch <= '9')
|
||||
state->mode = NUMBER_MODE;
|
||||
|
||||
/* check for tokens starting with less-than (<, <=, <>) */
|
||||
else if (state->ch == '<') {
|
||||
data->start_line = data->line;
|
||||
data->start_pos = data->pos;
|
||||
store_character (state);
|
||||
state->ch = read_character (state);
|
||||
state->mode = LESS_THAN_MODE;
|
||||
}
|
||||
|
||||
/* check for tokens starting with greater-than (>, >=) */
|
||||
else if (state->ch == '>') {
|
||||
data->start_line = data->line;
|
||||
data->start_pos = data->pos;
|
||||
store_character (state);
|
||||
state->ch = read_character (state);
|
||||
state->mode = GREATER_THAN_MODE;
|
||||
}
|
||||
|
||||
/* deal with other symbol operators */
|
||||
else if (strchr ("+-*/=(),", state->ch) != NULL) {
|
||||
data->start_line = data->line;
|
||||
data->start_pos = data->pos;
|
||||
store_character (state);
|
||||
state->token = new_Token_init (identify_symbol (state->ch),
|
||||
data->start_line, data->start_pos, state->content);
|
||||
}
|
||||
|
||||
/* double quotes start a string literal */
|
||||
else if (state->ch == '"') {
|
||||
data->start_line = data->line;
|
||||
data->start_pos = data->pos;
|
||||
state->ch = read_character (state);
|
||||
state->mode = STRING_LITERAL_MODE;
|
||||
}
|
||||
|
||||
/* detect end of file */
|
||||
else if (state->ch == EOF) {
|
||||
data->start_line = data->line;
|
||||
data->start_pos = data->pos;
|
||||
state->token = new_Token_init
|
||||
(TOKEN_EOF, data->start_line, data->start_pos, state->content);
|
||||
}
|
||||
|
||||
/* other characters are illegal */
|
||||
else {
|
||||
data->start_line = data->line;
|
||||
data->start_pos = data->pos;
|
||||
store_character (state);
|
||||
state->token = new_Token_init
|
||||
(TOKEN_ILLEGAL, data->start_line, data->start_pos, state->content);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Word mode - deal with character when building a word token
|
||||
* globals:
|
||||
* int start_line line on which the current token started
|
||||
* int start_pos char pos on which the current token started
|
||||
* params:
|
||||
* TokeniserState* state current state of the tokeniser
|
||||
*/
|
||||
static void word_mode (TokeniserState *state) {
|
||||
|
||||
/* local variables */
|
||||
TokenClass class; /* recognised class of keyword */
|
||||
|
||||
/* add letters and digits to the token */
|
||||
if ((state->ch >= 'A' && state->ch <= 'Z') ||
|
||||
(state->ch >= 'a' && state->ch <= 'z')) {
|
||||
store_character (state);
|
||||
state->ch = read_character (state);
|
||||
}
|
||||
|
||||
/* other characters are pushed back for the next token */
|
||||
else {
|
||||
if (state->ch != EOF)
|
||||
unread_character (state);
|
||||
class = identify_word (state->content);
|
||||
if (class == TOKEN_REM) {
|
||||
*state->content = '\0';
|
||||
state->mode = COMMENT_MODE;
|
||||
}
|
||||
else
|
||||
state->token = new_Token_init
|
||||
(class, data->start_line, data->start_pos, state->content);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Comment mode - skip till end of line after a REM
|
||||
* globals:
|
||||
* int start_line line on which the current token started
|
||||
* int start_pos char pos on which the current token started
|
||||
* params:
|
||||
* TokeniserState* state current state of the tokeniser
|
||||
*/
|
||||
static void comment_mode (TokeniserState *state) {
|
||||
if (state->ch == '\n')
|
||||
state->mode = DEFAULT_MODE;
|
||||
else
|
||||
state->ch = read_character (state);
|
||||
}
|
||||
|
||||
/*
|
||||
* Number mode - building a number token (integer only)
|
||||
* globals:
|
||||
* int start_line line on which the current token started
|
||||
* int start_pos char pos on which the current token started
|
||||
* params:
|
||||
* TokeniserState* state current state of the tokeniser
|
||||
*/
|
||||
static void number_mode (TokeniserState *state) {
|
||||
|
||||
/* add digits to the token */
|
||||
if (state->ch >= '0' && state->ch <= '9') {
|
||||
store_character (state);
|
||||
state->ch = read_character (state);
|
||||
}
|
||||
|
||||
/* other characters are pushed back for the next token */
|
||||
else {
|
||||
if (state->ch != EOF)
|
||||
unread_character (state);
|
||||
state->token = new_Token_init
|
||||
(TOKEN_NUMBER, data->start_line, data->start_pos, state->content);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* Less than mode - checking for <> and <= operators
|
||||
* globals:
|
||||
* int start_line line on which the current token started
|
||||
* int start_pos char pos on which the current token started
|
||||
* params:
|
||||
* TokeniserState* state current state of the tokeniser
|
||||
*/
|
||||
static void less_than_mode (TokeniserState *state) {
|
||||
if (state->ch == '=' || state->ch == '>')
|
||||
store_character (state);
|
||||
else
|
||||
unread_character (state);
|
||||
state->token = new_Token_init
|
||||
(identify_compound_symbol (state->content), data->start_line,
|
||||
data->start_pos, state->content);
|
||||
}
|
||||
|
||||
/*
|
||||
* Greater than mode - checking for >= and >< operators
|
||||
* globals:
|
||||
* int start_line line on which the current token started
|
||||
* int start_pos char pos on which the current token started
|
||||
* params:
|
||||
* TokeniserState* state current state of the tokeniser
|
||||
*/
|
||||
static void greater_than_mode (TokeniserState *state) {
|
||||
if (state->ch == '=' || state->ch == '<')
|
||||
store_character (state);
|
||||
else
|
||||
ungetc (state->ch, data->input);
|
||||
state->token = new_Token_init
|
||||
(identify_compound_symbol (state->content), data->start_line,
|
||||
data->start_pos, state->content);
|
||||
}
|
||||
|
||||
/*
|
||||
* String literal mode - reading a string
|
||||
* globals:
|
||||
* int start_line line on which the current token started
|
||||
* int start_pos char pos on which the current token started
|
||||
* params:
|
||||
* TokeniserState* state current state of the tokeniser
|
||||
*/
|
||||
static void string_literal_mode (TokeniserState *state) {
|
||||
|
||||
/* a quote terminates the string */
|
||||
if (state->ch == '"')
|
||||
state->token = new_Token_init
|
||||
(TOKEN_STRING, data->start_line, data->start_pos, state->content);
|
||||
|
||||
/* a backslash escapes the next character */
|
||||
else if (state->ch == '\\') {
|
||||
state->ch = read_character (state);
|
||||
store_character (state);
|
||||
state->ch = read_character (state);
|
||||
}
|
||||
|
||||
/* EOF generates an error */
|
||||
else if (state->ch == EOF)
|
||||
state->token = new_Token_init
|
||||
(TOKEN_ILLEGAL, data->start_line, data->start_pos, state->content);
|
||||
|
||||
/* all other characters are part of the string */
|
||||
else {
|
||||
store_character (state);
|
||||
state->ch = read_character (state);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Top Level Tokeniser Routines
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
* Get the next token
|
||||
* params:
|
||||
* TokenStream* token_stream the token stream being processed
|
||||
* returns:
|
||||
* Token* the token built
|
||||
*/
|
||||
static Token *next (TokenStream *token_stream) {
|
||||
|
||||
/* local variables */
|
||||
TokeniserState state; /* current state of reading */
|
||||
Token *return_token; /* token to return */
|
||||
|
||||
/* initialise */
|
||||
this = token_stream;
|
||||
data = this->data;
|
||||
state.token = NULL;
|
||||
state.mode = DEFAULT_MODE;
|
||||
state.max = 1024;
|
||||
state.content = malloc (state.max);
|
||||
*(state.content) = '\0';
|
||||
state.ch = read_character (&state);
|
||||
/* main loop */
|
||||
while (state.token == NULL) {
|
||||
switch (state.mode) {
|
||||
case DEFAULT_MODE:
|
||||
|
||||
default_mode (&state);
|
||||
break;
|
||||
case COMMENT_MODE:
|
||||
comment_mode (&state);
|
||||
break;
|
||||
case WORD_MODE:
|
||||
word_mode (&state);
|
||||
break;
|
||||
case NUMBER_MODE:
|
||||
number_mode (&state);
|
||||
break;
|
||||
case LESS_THAN_MODE:
|
||||
less_than_mode (&state);
|
||||
break;
|
||||
case GREATER_THAN_MODE:
|
||||
greater_than_mode (&state);
|
||||
break;
|
||||
case STRING_LITERAL_MODE:
|
||||
string_literal_mode (&state);
|
||||
break;
|
||||
default:
|
||||
state.token = new_Token_init
|
||||
(TOKEN_EOF, data->start_line, data->start_pos, state.content);
|
||||
state.ch = EOF; /* temporary hack */
|
||||
}
|
||||
}
|
||||
|
||||
/* store token and release state memory */
|
||||
return_token = state.token;
|
||||
free (state.content);
|
||||
|
||||
/* return result */
|
||||
return return_token;
|
||||
|
||||
}
|
||||
|
||||
/*
|
||||
* Getter for the current line number
|
||||
* paramss:
|
||||
* TokenStream* token_stream the token stream being processed
|
||||
* returns:
|
||||
* int the current line number returned
|
||||
*/
|
||||
static int get_line (TokenStream *token_stream) {
|
||||
this = token_stream;
|
||||
data = this->data;
|
||||
return data->line;
|
||||
}
|
||||
|
||||
/*
|
||||
* Destructor for a TokenStream
|
||||
* params:
|
||||
* TokenStream* token_stream the doomed token stream
|
||||
*/
|
||||
static void destroy (TokenStream *token_stream) {
|
||||
if (token_stream) {
|
||||
if (token_stream->data)
|
||||
free (token_stream->data);
|
||||
free (token_stream);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Constructors
|
||||
*/
|
||||
|
||||
|
||||
/*
|
||||
* Constructor for TokenStream
|
||||
* params:
|
||||
* FILE* input Input file
|
||||
* returns:
|
||||
* TokenStream* The new token stream
|
||||
*/
|
||||
TokenStream *new_TokenStream (FILE *input) {
|
||||
|
||||
/* allocate the memory */
|
||||
this = malloc (sizeof (TokenStream));
|
||||
this->data = data = malloc (sizeof (Private));
|
||||
|
||||
/* initialise methods */
|
||||
this->next = next;
|
||||
this->get_line = get_line;
|
||||
this->destroy = destroy;
|
||||
|
||||
/* initialise data */
|
||||
data->input = input;
|
||||
data->line = data->start_line = 1;
|
||||
data->pos = data->start_pos = 0;
|
||||
|
||||
/* return new token stream */
|
||||
return this;
|
||||
}
|
Reference in New Issue
Block a user