kolibrios-fun/programs/media/unrtf/parse.c


/*=============================================================================
   GNU UnRTF, a command-line program to convert RTF documents to other formats.
   Copyright (C) 2000,2001 Zachary Thayer Smith

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

   The author is reachable by electronic mail at tuorfa@yahoo.com.
=============================================================================*/


/*----------------------------------------------------------------------
 * Module name:    parse
 * Author name:    Zach Smith
 * Create date:    01 Sep 00
 * Purpose:        Parsing of the RTF file into a structure of Word objects.
 *----------------------------------------------------------------------
 * Changes:
 * 15 Oct 00, tuorfa@yahoo.com: parse.c created with functions taken from word.c
 * 15 Oct 00, tuorfa@yahoo.com: backslash before newline is now \par
 * 08 Apr 01, tuorfa@yahoo.com: removed limit on word length
 * 03 Aug 01, tuorfa@yahoo.com: added input buffering
 * 19 Sep 01, tuorfa@yahoo.com: cleaned up read_word()
 * 22 Sep 01, tuorfa@yahoo.com: moved word_dump() to word.c
 * 22 Sep 01, tuorfa@yahoo.com: added function-level comment blocks 
 *--------------------------------------------------------------------*/

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>

#include "defs.h"
#include "parse.h"
#include "malloc.h"
#include "main.h"
#include "error.h"
#include "word.h"
#include "hash.h"


/* Push-back slots shared by my_unget_char() and my_getchar().
 * A negative value marks a slot as empty; ungot_char is the slot that
 * my_getchar() hands out first, followed by ungot_char2 and ungot_char3.
 */
static int ungot_char  = -1;
static int ungot_char2 = -1;
static int ungot_char3 = -1;


/*========================================================================
 * Name:	my_unget_char
 * Purpose:	Pushes a character back so that the next my_getchar() call
 *		returns it.  Up to 3 characters can be pending at once; the
 *		most recently pushed one is returned first.
 * Args:	Character to push back.
 * Returns:	None.
 *=======================================================================*/

static void my_unget_char (int ch)
{
	/* A slot is free while it still holds a negative value. */
	const int slot_available =
		(ungot_char < 0) || (ungot_char2 < 0) || (ungot_char3 < 0);

	if (!slot_available) {
		error_handler ("more than 3 ungot chars");
	}

	/* Shift the pending characters down one slot and put the new
	 * one at the front.
	 */
	ungot_char3 = ungot_char2;
	ungot_char2 = ungot_char;
	ungot_char  = ch;
}


/* The character most recently handed out by my_getchar(); consulted
 * there to spot a newline that directly follows a backslash.
 */
static int last_returned_ch = 0;


/* Input buffering state used by my_getchar(). */

/* Size requested for the read buffer on the first allocation attempt. */
#define READ_BUF_LEN 2048

static int   buffer_size    = 0;	/* size actually requested for read_buf */
static char *read_buf       = NULL;	/* allocated on the first refill */
static int   read_buf_end   = 0;	/* number of bytes the last fread() delivered */
static int   read_buf_index = 0;	/* position of the next byte to hand out */


/*========================================================================
 * Name:	my_getchar
 * Purpose:	Gets a character: either an ungot one, or a buffered one
 *		read from the input file.
 * Args:	Input file.
 * Returns:	Character as a non-negative value (0..UCHAR_MAX), or EOF.
 * Note:	For characters taken from the file buffer:
 *		  - carriage returns are skipped;
 *		  - every newline increments lineno;
 *		  - a newline directly following a returned backslash is
 *		    replaced by the characters "par " (so that the input
 *		    reads as \par);
 *		  - a tab is returned as a space.
 *		Ungot characters are returned exactly as they were pushed.
 *=======================================================================*/

static int my_getchar (FILE* f)
{
	int ch;

	CHECK_PARAM_NOT_NULL(f);

	/* Pushed-back characters take priority over the file buffer. */
	if (ungot_char>=0) {
		ch = ungot_char; 
		ungot_char=ungot_char2; 
		ungot_char2=ungot_char3;
		ungot_char3=-1;
		last_returned_ch = ch;
		return ch;
	}
	do {
		/* Refill the buffer once it has been used up. */
		if (read_buf_index >= read_buf_end) {
			if (!read_buf) {
				buffer_size = READ_BUF_LEN;
				read_buf = my_malloc (buffer_size);
				if (!read_buf) {
					/* Retry with a quarter of the size. */
					buffer_size /= 4;
					read_buf = my_malloc (buffer_size);
					if (!read_buf) 
						error_handler ("cannot allocate read buffer");
				}
			}
			read_buf_end = (int) fread (read_buf, 1, buffer_size, f);
			read_buf_index = 0;
			if (!read_buf_end)
				return EOF;
		}

		/* Convert through unsigned char, as getc() does.  Where plain
		 * char is signed, a byte >= 0x80 would otherwise become a
		 * negative int: 0xFF would be indistinguishable from EOF and
		 * any such byte would be lost when pushed back, because
		 * negative values mark empty unget slots.
		 */
		ch = (unsigned char) read_buf [read_buf_index++];

		if (ch=='\n') {
			lineno++;
			/* Convert \(newline) into \par here.
			 * NOTE(review): last_returned_ch is also a backslash
			 * right after an escaped backslash ("\\"), so a newline
			 * following one is converted as well -- confirm whether
			 * that is intended.
			 */
			if (last_returned_ch=='\\') {
				/* Pushed in reverse so they come back as "ar ". */
				my_unget_char (' ');
				my_unget_char ('r');
				my_unget_char ('a');
				ch = 'p';
				break;
			}
		}
	} 
	while (ch=='\r' /* || ch=='\n' */ );

	if (ch=='\t') ch = ' ';

	last_returned_ch = ch;
	return ch;
}


/* Word buffer state: written by read_word() and expand_word_buffer(),
 * read and released by word_read().
 */
static char         *input_str          = NULL;	/* text of the current word */
static unsigned long current_max_length = 1;	/* size requested for input_str */


/*========================================================================
 * Name:	expand_word_buffer
 * Purpose:	Doubles the size of the buffer used to store an incoming
 *		word, preserving its current contents.
 *		This allows us to remove the limit on word length.
 * Args:	None.
 * Returns:	TRUE.  Every failure (no buffer to grow, size overflow,
 *		allocation failure) is reported through error_handler().
 *=======================================================================*/

static int
expand_word_buffer (void)
{
	char *new_ptr;
	unsigned long old_length;
	unsigned long new_length;

	if (!input_str)
		error_handler ("no input buffer allocated");

	old_length = current_max_length;
	new_length = old_length * 2;

	/* Unsigned doubling wraps around silently; a result that did not
	 * grow means the size can no longer be represented (or was zero).
	 */
	if (new_length <= old_length)
		error_handler ("out of memory while resizing buffer");

	new_ptr = my_malloc (new_length);
	if (!new_ptr)
		error_handler ("out of memory while resizing buffer");
	
	/* Only the old buffer's bytes are defined; copy exactly those. */
	memcpy (new_ptr, input_str, old_length);
	my_free (input_str);
	input_str = new_ptr;
	current_max_length = new_length;
	return TRUE;
}


/*========================================================================
 * Name:	read_word
 * Purpose:	The core of the parser, this reads a word.  A word is one of:
 *		  - a single space (standing for a run of spaces);
 *		  - one of the characters { } ;
 *		  - a two-character control symbol such as \~ or \{ ;
 *		  - a hex character expression \'## ;
 *		  - a control word (backslash, name, optional numeric
 *		    parameter);
 *		  - a run of ordinary text.
 * Args:	Input file.
 * Returns:	Number of characters in the word, or zero at end of input.
 * Note:	The word is stored, NUL-terminated, in the file-static
 *		input_str buffer.  A fresh buffer is allocated on every
 *		call (also when zero is returned); releasing it is left
 *		to the caller.
 *=======================================================================*/

static int 
read_word (FILE *f) 
{
	int ch, ch2, ix=0;
	int have_whitespace=FALSE;
	int is_control_word=FALSE;
	int has_numeric_param=FALSE; /* if is_control_word==TRUE */
	int need_unget=FALSE;

	CHECK_PARAM_NOT_NULL(f);

	current_max_length = 10; /* XX */

	/* Get some storage for a word.
	 */
	input_str = my_malloc (current_max_length);
	if (!input_str)
		error_handler("cannot allocate word storage");

	/* Newlines in front of a word are skipped.
	 */
	do {
		ch = my_getchar(f);
	} 
	while (ch=='\n');

	if (ch==' ')
	{
		/* Compress multiple space chars down to one.
		 */
		while (ch == ' ') {
			ch = my_getchar(f);
			have_whitespace=TRUE;
		}
		if (have_whitespace) {
			/* The first non-space char belongs to the next word. */
			my_unget_char (ch);
			input_str[0]=' '; 
			input_str[1]=0;
			return 1;
		}
	}

	switch(ch) 
	{
	case EOF: 
		return 0;

	case '\\':
		ch2 = my_getchar(f);

		/* Look for two-character command words.
		 */
		switch (ch2) 
		{
		case '\n':
			strcpy (input_str, "\\par");
			return 4;
		case '~':
		case '{':
		case '}':
		case '\\':
		case '_':
		case '-':
			input_str[0] = '\\';
			input_str[1] = ch2;
			input_str[2] = 0;
			return 2;
		case '\'':
			/* Preserve \'## expressions (hex char exprs) for later.
			 * The two characters after the quote are stored as read.
			 */
			input_str[0]='\\'; 
			input_str[1]='\'';
			ix=2;
			if(ix==current_max_length) {
				if (!expand_word_buffer ())
					error_handler("word too long");
			}
			ch = my_getchar(f);
			input_str[ix++]=ch;
			if(ix==current_max_length) {
				if (!expand_word_buffer ())
					error_handler("word too long");
			}
			ch = my_getchar(f);
			input_str[ix++]=ch;
			if(ix==current_max_length) {
				if (!expand_word_buffer ())
					error_handler("word too long");
			}
			input_str[ix]=0;
			return ix;
		default:
			break;
		}

		/* Anything else after a backslash starts a control word;
		 * the char already read becomes its first name character.
		 */
		is_control_word=TRUE;
		ix=1;
		input_str[0]=ch;
		ch=ch2;
		break;

	case '\t':
		/* In RTF, a tab char is the same as \tab.
		 */
		strcpy (input_str, "\\tab");
		return 4;

	case '{':
	case '}':
	case ';':
		input_str[0]=ch; 
		input_str[1]=0;
		return 1;

	default:
		break;
	}

	while (ch!=EOF)
	{
		/* Several chars always ends a word, and we need to save them.
		 */
		if (ch=='\t' || ch=='{' || ch=='}' || ch=='\\') {
			need_unget=TRUE;
			break;
		}

		/* A newline always ends a command word; we don't save it. 
		 * A newline is ignored if this is not a command word.
		 */
		if (ch=='\n') { 
			if (is_control_word) 
				break;
			ch = my_getchar(f); 
			continue; 
		}

		/* A semicolon always ends a command word; we do save it. 
		 * A semicolon never ends a regular word.
		 */
		if (ch==';') {
			if (is_control_word) {
				need_unget=TRUE;
				break;
			}
		}

		/* In this parser, a space character terminates
		 * any word, and if it does not follow a command,
		 * then it is a word in itself.
		 */
		if (ch==' ') {
			if (!is_control_word)
				need_unget=TRUE;
			break;
		}

		/* Identify a control word's numeric parameter.
		 * The <ctype.h> functions are only defined for EOF and for
		 * values representable as unsigned char, so ch is converted
		 * before it is classified.
		 */
		if (is_control_word) {
			if (!has_numeric_param && (isdigit((unsigned char) ch) || ch=='-'))
				has_numeric_param = TRUE;
			else
			if (has_numeric_param && !isdigit((unsigned char) ch)) {
				/* The first non-digit after the parameter ends the
				 * control word and belongs to the next word.  (A
				 * space cannot get here; it was handled above.)
				 */
				need_unget=TRUE;
				break;
			}
		}
		
		input_str[ix++] = ch;
		if (ix==current_max_length) {
			if (!expand_word_buffer ())
				error_handler("word too long");
		}
		ch = my_getchar (f);
	}

	if (need_unget)
		my_unget_char(ch);

	input_str[ix]=0;
	return ix;
}


/*========================================================================
 * Name:	word_read
 * Purpose:	This is the recursive metareader which pieces together the 
 *		structure of Word objects.  Words are chained through
 *		their next pointers; a '{' produces a dummy word whose
 *		child is the list read by a recursive call, and a '}'
 *		ends the current list.
 * Args:	Input file.
 * Returns:	First Word of the list that was read, or NULL if the list
 *		is empty (end of input or '}' met before any word).
 * Note:	The buffer allocated by read_word() is released here on
 *		every path, leaving input_str set to NULL.
 *=======================================================================*/

Word *
word_read (FILE* f) {
	Word * prev_word = NULL;
	Word * first_word = NULL;
	Word * new_word = NULL; /* temp */

	CHECK_PARAM_NOT_NULL(f);

	for (;;) {
		if (!read_word(f)) {
			/* End of input.  read_word() allocated a buffer even
			 * though it found no word; release it.
			 */
			if (input_str) {
				my_free (input_str);
				input_str = NULL;
			}
			return first_word;
		}

		if (input_str[0] == '{') {
			/* Process subwords */

			/* The text of the brace is not needed any more.  Free
			 * it now: the nested call allocates a new buffer into
			 * input_str and the old one would be lost otherwise.
			 */
			my_free (input_str);
			input_str = NULL;

			/* Create a dummy word to point to a sublist */
			new_word = word_new (NULL);
			if (!new_word)
				error_handler ("cannot allocate word");

			/* Get the sublist */
			new_word->child = word_read (f);

		} else if (input_str[0] == '}') {
			/* End of the current list. */
			my_free (input_str);
			input_str = NULL;
			return first_word;
		} else {
			new_word = word_new (input_str);
			if (!new_word)
				error_handler ("cannot allocate word");
		}

		/* Append the new word to the list being built. */
		if (prev_word) prev_word->next = new_word;

		if (!first_word) first_word = new_word;

		prev_word = new_word;

		/* Free up the memory allocated by read_word.  It has already
		 * been released if the word was an opening brace.
		 */
		if (input_str) {
			my_free (input_str);
			input_str = NULL;
		}
	}
}
Upload GNU unRTF git-svn-id: svn://kolibrios.org@8335 a494cfbc-eb01-0410-851d-a64ba20cac60 2020-12-07 10:06:07 +01:00
			`/*=============================================================================`
			`GNU UnRTF, a command-line program to convert RTF documents to other formats.`
			`Copyright (C) 2000,2001 Zachary Thayer Smith`

			`This program is free software; you can redistribute it and/or modify`
			`it under the terms of the GNU General Public License as published by`
			`the Free Software Foundation; either version 2 of the License, or`
			`(at your option) any later version.`

			`This program is distributed in the hope that it will be useful,`
			`but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`GNU General Public License for more details.`

			`You should have received a copy of the GNU General Public License`
			`along with this program; if not, write to the Free Software`
			`Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA`

			`The author is reachable by electronic mail at tuorfa@yahoo.com.`
			`=============================================================================*/`


			`/*----------------------------------------------------------------------`
			`* Module name: parse`
			`* Author name: Zach Smith`
			`* Create date: 01 Sep 00`
			`* Purpose: Parsing of the RTF file into a structure of Word objects.`
			`*----------------------------------------------------------------------`
			`* Changes:`
			`* 15 Oct 00, tuorfa@yahoo.com: parse.c created with functions taken from word.c`
			`* 15 Oct 00, tuorfa@yahoo.com: backslash before newline is now \par`
			`* 08 Apr 01, tuorfa@yahoo.com: removed limit on word length`
			`* 03 Aug 01, tuorfa@yahoo.com: added input buffering`
			`* 19 Sep 01, tuorfa@yahoo.com: cleaned up read_word()`
			`* 22 Sep 01, tuorfa@yahoo.com: moved word_dump() to word.c`
			`* 22 Sep 01, tuorfa@yahoo.com: added function-level comment blocks`
			`*--------------------------------------------------------------------*/`

			`#include <stdio.h>`
			`#include <stdlib.h>`
			`#include <ctype.h>`
			`#include <string.h>`

			`#include "defs.h"`
			`#include "parse.h"`
			`#include "malloc.h"`
			`#include "main.h"`
			`#include "error.h"`
			`#include "word.h"`
			`#include "hash.h"`



			`/* local to getchar stuff */`
			`static int ungot_char=-1;`
			`static int ungot_char2=-1;`
			`static int ungot_char3=-1;`



			`/*========================================================================`
			`* Name: my_unget_char`
			`* Purpose: My own unget routine, handling up to 3 ungot characters.`
			`* Args: Character.`
			`* Returns: None.`
			`*=======================================================================*/`

			`static void my_unget_char (int ch)`
			`{`
			`if (ungot_char>=0 && ungot_char2>=0 && ungot_char3>=0)`
			`error_handler ("more than 3 ungot chars");`

			`ungot_char3 = ungot_char2;`
			`ungot_char2 = ungot_char;`
			`ungot_char = ch;`
			`}`


			`static int last_returned_ch=0;`


			`#define READ_BUF_LEN 2048`
			`static int buffer_size = 0;`
			`static char *read_buf = NULL;`
			`static int read_buf_end = 0;`
			`static int read_buf_index = 0;`





			`/*========================================================================`
			`* Name: my_getchar`
			`* Purpose: Gets a character: either an ungot one, or a buffered one.`
			`* Args: Input file.`
			`* Returns: Character, or EOF.`
			`*=======================================================================*/`

			`static int my_getchar (FILE* f)`
			`{`
			`int ch;`

			`CHECK_PARAM_NOT_NULL(f);`

			`if (ungot_char>=0) {`
			`ch = ungot_char;`
			`ungot_char=ungot_char2;`
			`ungot_char2=ungot_char3;`
			`ungot_char3=-1;`
			`last_returned_ch = ch;`
			`return ch;`
			`}`
			`do {`
			`if (read_buf_index >= read_buf_end) {`
			`if (!read_buf) {`
			`buffer_size = READ_BUF_LEN;`
			`read_buf = my_malloc (buffer_size);`
			`if (!read_buf) {`
			`buffer_size /= 4;`
			`read_buf = my_malloc (buffer_size);`
			`if (!read_buf)`
			`error_handler ("cannot allocate read buffer");`
			`}`
			`}`
			`read_buf_end = fread (read_buf, 1, buffer_size, f);`
			`read_buf_index = 0;`
			`if (!read_buf_end)`
			`return EOF;`
			`}`
			`ch = read_buf [read_buf_index++];`

			`if (ch=='\n') {`
			`lineno++;`
			`/* Convert \(newline) into \par here */`
			`if (last_returned_ch=='\\') {`
			`my_unget_char (' ');`
			`my_unget_char ('r');`
			`my_unget_char ('a');`
			`ch = 'p';`
			`break;`
			`}`
			`}`
			`}`
			`while (ch=='\r' /* \|\| ch=='\n' */ );`

			`if (ch=='\t') ch = ' ';`

			`last_returned_ch = ch;`
			`return ch;`
			`}`


			`/* local to read_word */`
			`static char *input_str = NULL;`
			`static unsigned long current_max_length = 1;`



			`/*========================================================================`
			`* Name: expand_word_buffer`
			`* Purpose: Expands the buffer used to store an incoming word.`
			`* This allows us to remove the limit on word length.`
			`* Args: None.`
			`* Returns: None.`
			`*=======================================================================*/`

			`static int`
			`expand_word_buffer ()`
			`{`
			`char *new_ptr;`
			`unsigned long old_length;`
			`if (!input_str)`
			`error_handler ("no input buffer allocated");`
			`old_length = current_max_length;`
			`current_max_length *= 2;`
			`new_ptr = my_malloc (current_max_length);`
			`if (!new_ptr)`
			`error_handler ("out of memory while resizing buffer");`

			`memcpy (new_ptr, input_str, old_length);`
			`my_free (input_str);`
			`input_str = new_ptr;`
			`return TRUE;`
			`}`




			`/*========================================================================`
			`* Name: read_word`
			`* Purpose: The core of the parser, this reads a word.`
			`* Args: Input file.`
			`* Returns: Number of characters in the word, or zero.`
			`* Note: The word buffer is static and local to this file.`
			`*=======================================================================*/`

			`static int`
			`read_word (FILE *f)`
			`{`
			`int ch, ch2, ix=0;`
			`int have_whitespace=FALSE;`
			`int is_control_word=FALSE;`
			`int has_numeric_param=FALSE; /* if is_control_word==TRUE */`
			`int need_unget=FALSE;`

			`CHECK_PARAM_NOT_NULL(f);`

			`current_max_length = 10; /* XX */`

			`/* Get some storage for a word.`
			`*/`
			`input_str = my_malloc (current_max_length);`
			`if (!input_str)`
			`error_handler("cannot allocate word storage");`

			`do {`
			`ch = my_getchar(f);`
			`}`
			`while (ch=='\n');`

			`if (ch==' ')`
			`{`
			`/* Compress multiple space chars down to one.`
			`*/`
			`while (ch == ' ') {`
			`ch = my_getchar(f);`
			`have_whitespace=TRUE;`
			`}`
			`if (have_whitespace) {`
			`my_unget_char (ch);`
			`input_str[0]=' ';`
			`input_str[1]=0;`
			`return 1;`
			`}`
			`}`

			`switch(ch)`
			`{`
			`case EOF:`
			`return 0;`

			`case '\\':`
			`ch2 = my_getchar(f);`

			`/* Look for two-character command words.`
			`*/`
			`switch (ch2)`
			`{`
			`case '\n':`
			`strcpy (input_str, "\\par");`
			`return 4;`
			`case '~':`
			`case '{':`
			`case '}':`
			`case '\\':`
			`case '_':`
			`case '-':`
			`input_str[0] = '\\';`
			`input_str[1] = ch2;`
			`input_str[2] = 0;`
			`return 2;`
			`case '\'':`
			`/* Preserve \'## expressions (hex char exprs) for later.`
			`*/`
			`input_str[0]='\\';`
			`input_str[1]='\'';`
			`ix=2;`
			`if(ix==current_max_length) {`
			`if (!expand_word_buffer ())`
			`error_handler("word too long");`
			`}`
			`ch = my_getchar(f);`
			`input_str[ix++]=ch;`
			`if(ix==current_max_length) {`
			`if (!expand_word_buffer ())`
			`error_handler("word too long");`
			`}`
			`ch = my_getchar(f);`
			`input_str[ix++]=ch;`
			`if(ix==current_max_length) {`
			`if (!expand_word_buffer ())`
			`error_handler("word too long");`
			`}`
			`input_str[ix]=0;`
			`return ix;`
			`}`

			`is_control_word=TRUE;`
			`ix=1;`
			`input_str[0]=ch;`
			`ch=ch2;`
			`break;`

			`case '\t':`
			`/* In RTF, a tab char is the same as \tab.`
			`*/`
			`strcpy (input_str, "\\tab");`
			`return 4;`

			`case '{':`
			`case '}':`
			`case ';':`
			`input_str[0]=ch;`
			`input_str[1]=0;`
			`return 1;`

			`}`

			`while (ch!=EOF)`
			`{`
			`/* Several chars always ends a word, and we need to save them.`
			`*/`
			`if (ch=='\t' \|\| ch=='{' \|\| ch=='}' \|\| ch=='\\') {`
			`need_unget=TRUE;`
			`break;`
			`}`

			`/* A newline always ends a command word; we don't save it.`
			`* A newline is ignored if this is not a command word.`
			`*/`
			`if (ch=='\n') {`
			`if (is_control_word)`
			`break;`
			`ch = my_getchar(f);`
			`continue;`
			`}`

			`/* A semicolon always ends a command word; we do save it.`
			`* A semicolon never ends a regular word.`
			`*/`
			`if (ch==';') {`
			`if (is_control_word) {`
			`need_unget=TRUE;`
			`break;`
			`}`
			`}`

			`/* In this parser, a space character terminates`
			`* any word, and if it does not follow a command,`
			`* then it is a word in itself.`
			`*/`
			`if (ch==' ') {`
			`if (!is_control_word)`
			`need_unget=TRUE;`
			`break;`
			`}`

			`/* Identify a control word's numeric parameter.`
			`*/`
			`if (is_control_word) {`
			`if (!has_numeric_param && (isdigit(ch) \|\| ch=='-'))`
			`has_numeric_param = TRUE;`
			`else`
			`if (has_numeric_param && !isdigit(ch)) {`
			`if (ch!=' ')`
			`need_unget=TRUE;`
			`break;`
			`}`
			`}`

			`input_str[ix++] = ch;`
			`if (ix==current_max_length) {`
			`if (!expand_word_buffer ())`
			`error_handler("word too long");`
			`}`
			`ch = my_getchar (f);`
			`}`

			`if (need_unget)`
			`my_unget_char(ch);`

			`input_str[ix]=0;`
			`return ix;`
			`}`



			`/*========================================================================`
			`* Name: word_read`
			`* Purpose: This is the recursive metareader which pieces together the`
			`* structure of Word objects.`
			`* Args: Input file.`
			`* Returns: Tree of Word objects.`
			`*=======================================================================*/`

			`Word *`
			`word_read (FILE* f) {`
			`Word * prev_word = NULL;`
			`Word * first_word = NULL;`
			`Word * new_word = NULL; /* temp */`

			`CHECK_PARAM_NOT_NULL(f);`

			`do {`
			`if (!read_word(f)) {`
			`return first_word;`
			`}`


			`if (input_str[0] == '{') {`
			`/* Process subwords */`

			`#if 0`
			`printf ("processing subword...\n");`
			`#endif`

			`/* Create a dummy word to point to a sublist */`
			`new_word = word_new (NULL);`
			`if (!new_word)`
			`error_handler ("cannot allocate word");`

			`/* Get the sublist */`
			`new_word->child = word_read (f);`
			`if (!new_word->hash_index && !new_word->child)`
			`{`
			`/* printf ("unable to read children!\n"); */`
			`}`

			`} else if (input_str[0] == '}') {`
			`#if 0`
			`printf ("returning from word_read.\n");`
			`#endif`
			`return first_word;`
			`} else {`
			`new_word = word_new (input_str);`
			`}`

			`if (prev_word) prev_word->next = new_word;`

			`if (!first_word) first_word = new_word;`

			`prev_word = new_word;`

			`/* Free up the memory allocated by read_word.`
			`*/`
			`my_free (input_str);`
			`input_str = NULL;`
			`}`
			`while(1);`

			`}`