445 lines
9.8 KiB
C
Raw Normal View History

/*=============================================================================
GNU UnRTF, a command-line program to convert RTF documents to other formats.
Copyright (C) 2000,2001 Zachary Thayer Smith
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
The author is reachable by electronic mail at tuorfa@yahoo.com.
=============================================================================*/
/*----------------------------------------------------------------------
* Module name: parse
* Author name: Zach Smith
* Create date: 01 Sep 00
* Purpose: Parsing of the RTF file into a structure of Word objects.
*----------------------------------------------------------------------
* Changes:
* 15 Oct 00, tuorfa@yahoo.com: parse.c created with functions taken from word.c
* 15 Oct 00, tuorfa@yahoo.com: backslash before newline is now \par
* 08 Apr 01, tuorfa@yahoo.com: removed limit on word length
* 03 Aug 01, tuorfa@yahoo.com: added input buffering
* 19 Sep 01, tuorfa@yahoo.com: cleaned up read_word()
* 22 Sep 01, tuorfa@yahoo.com: moved word_dump() to word.c
* 22 Sep 01, tuorfa@yahoo.com: added function-level comment blocks
*--------------------------------------------------------------------*/
#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include "defs.h"
#include "parse.h"
#include "malloc.h"
#include "main.h"
#include "error.h"
#include "word.h"
#include "hash.h"
/* Pushback stack shared by my_unget_char() and my_getchar().
 * ungot_char is the top of the stack (the next character handed out),
 * ungot_char2 and ungot_char3 lie beneath it. A negative value marks
 * an empty slot.
 */
static int ungot_char = -1;
static int ungot_char2 = -1;
static int ungot_char3 = -1;

/*========================================================================
 * Name:	my_unget_char
 * Purpose:	My own unget routine, handling up to 3 ungot characters.
 *		The character ungot last is the first one read back.
 * Args:	Character. A negative value (such as EOF) is ignored.
 * Returns:	None.
 * Note:	Reports "more than 3 ungot chars" through error_handler()
 *		when all three slots are already occupied.
 *=======================================================================*/
static void my_unget_char (int ch)
{
	/* Negative values are the "empty slot" marker. Pushing one would
	 * shift any pending characters underneath an empty top slot, and
	 * my_getchar() only ever inspects the top slot, so those characters
	 * would be stranded. Treat it as a no-op instead, which is also what
	 * ungetc() does for EOF.
	 */
	if (ch < 0)
		return;

	if (ungot_char >= 0 && ungot_char2 >= 0 && ungot_char3 >= 0)
		error_handler ("more than 3 ungot chars");

	/* Shift the stack down one slot and place the new character on top. */
	ungot_char3 = ungot_char2;
	ungot_char2 = ungot_char;
	ungot_char = ch;
}
/* State local to my_getchar(). */
static int last_returned_ch = 0;	/* previous character handed out */

#define READ_BUF_LEN 2048

static int buffer_size = 0;		/* allocated size of read_buf */
static char *read_buf = NULL;		/* input buffer, allocated on first use */
static int read_buf_end = 0;		/* number of valid bytes in read_buf */
static int read_buf_index = 0;		/* next unread position in read_buf */

/*========================================================================
 * Name:	my_getchar
 * Purpose:	Gets a character: either an ungot one, or a buffered one.
 *		Characters taken from the file are filtered on the way:
 *		  - carriage returns are skipped;
 *		  - each newline increments lineno;
 *		  - a newline directly following a returned backslash is
 *		    replaced by the characters "par " (so the backslash
 *		    and the replacement together read as "\par ");
 *		  - a tab is returned as a space.
 *		Ungot characters are handed back unfiltered.
 * Args:	Input file.
 * Returns:	Character in the range 0..255, or EOF.
 *=======================================================================*/
static int my_getchar (FILE* f)
{
	int ch;

	CHECK_PARAM_NOT_NULL(f);

	/* Pending pushback takes priority: pop the top of the stack. */
	if (ungot_char >= 0) {
		ch = ungot_char;
		ungot_char = ungot_char2;
		ungot_char2 = ungot_char3;
		ungot_char3 = -1;
		last_returned_ch = ch;
		return ch;
	}

	do {
		if (read_buf_index >= read_buf_end) {
			/* Buffer exhausted; allocate it on first use. */
			if (!read_buf) {
				buffer_size = READ_BUF_LEN;
				read_buf = my_malloc (buffer_size);
				if (!read_buf) {
					/* Retry once with a quarter-size buffer. */
					buffer_size /= 4;
					read_buf = my_malloc (buffer_size);
					if (!read_buf)
						error_handler ("cannot allocate read buffer");
				}
			}
			read_buf_end = (int) fread (read_buf, 1, buffer_size, f);
			read_buf_index = 0;
			if (!read_buf_end)
				return EOF;
		}

		/* Convert through unsigned char: where plain char is signed,
		 * a 0xFF byte would otherwise compare equal to EOF and any
		 * byte >= 0x80 would come back negative, which the pushback
		 * stack treats as "empty" and <ctype.h> functions reject.
		 */
		ch = (unsigned char) read_buf [read_buf_index++];

		if (ch == '\n') {
			lineno++;
			/* Convert \(newline) into \par here: return 'p' now
			 * and queue "ar " (pushed in reverse order) behind it.
			 */
			if (last_returned_ch == '\\') {
				my_unget_char (' ');
				my_unget_char ('r');
				my_unget_char ('a');
				ch = 'p';
				break;
			}
		}
	}
	while (ch == '\r' /* || ch=='\n' */ );

	if (ch == '\t')
		ch = ' ';

	last_returned_ch = ch;
	return ch;
}
/* local to read_word */
static char *input_str = NULL;			/* buffer holding the current word */
static unsigned long current_max_length = 1;	/* allocated size of input_str */

/*========================================================================
 * Name:	expand_word_buffer
 * Purpose:	Expands the buffer used to store an incoming word.
 *		This allows us to remove the limit on word length.
 *		The buffer size is doubled and the old contents are copied
 *		into the new storage.
 * Args:	None.
 * Returns:	TRUE. Failures (no buffer allocated, size wrap-around,
 *		allocation failure) are reported through error_handler().
 *=======================================================================*/
static int
expand_word_buffer (void)
{
	char *new_ptr;
	unsigned long old_length;

	if (!input_str)
		error_handler ("no input buffer allocated");

	old_length = current_max_length;
	current_max_length *= 2;

	/* Unsigned multiplication wraps silently; a result that did not grow
	 * means the doubled size is not representable (or the size was zero),
	 * and copying old_length bytes into it would overrun the new block.
	 */
	if (current_max_length <= old_length)
		error_handler ("out of memory while resizing buffer");

	new_ptr = my_malloc (current_max_length);
	if (!new_ptr)
		error_handler ("out of memory while resizing buffer");

	memcpy (new_ptr, input_str, old_length);
	my_free (input_str);
	input_str = new_ptr;

	return TRUE;
}
/*========================================================================
 * Name:	read_word
 * Purpose:	The core of the parser, this reads a word.
 *		A word is one of:
 *		  - a single space, standing for a run of spaces;
 *		  - "{", "}" or ";" on its own;
 *		  - a two-character control symbol (\~ \{ \} \\ \_ \-);
 *		  - a hex escape \'hh, kept verbatim for later decoding;
 *		  - a control word (backslash, name, optional numeric
 *		    parameter);
 *		  - a run of plain text.
 * Args:	Input file.
 * Returns:	Number of characters in the word, or zero at end of input.
 * Note:	The word buffer (input_str) is static and local to this file.
 *		A fresh buffer is allocated on every call, including the
 *		call that returns zero, and is never freed here; releasing
 *		it is left to the caller.
 *=======================================================================*/
static int
read_word (FILE *f)
{
	int ch, ch2, ix=0;
	int is_control_word=FALSE;
	int has_numeric_param=FALSE;	/* if is_control_word==TRUE */
	int need_unget=FALSE;

	CHECK_PARAM_NOT_NULL(f);

	/* Initial capacity. The fixed-size stores below ("\par", "\tab",
	 * and the \'hh escape) write up to 5 bytes without growing the
	 * buffer, so this must never be set below 5.
	 */
	current_max_length = 10;

	/* Get some storage for a word.
	 */
	input_str = my_malloc (current_max_length);
	if (!input_str)
		error_handler("cannot allocate word storage");

	/* Newlines between words carry no meaning; skip them.
	 */
	do {
		ch = my_getchar(f);
	}
	while (ch=='\n');

	if (ch==' ')
	{
		/* Compress multiple space chars down to one. The first
		 * non-space character is pushed back for the next call.
		 */
		do {
			ch = my_getchar(f);
		}
		while (ch==' ');

		my_unget_char (ch);
		input_str[0]=' ';
		input_str[1]=0;
		return 1;
	}

	switch(ch)
	{
	case EOF:
		return 0;

	case '\\':
		ch2 = my_getchar(f);

		/* Look for two-character command words.
		 */
		switch (ch2)
		{
		case '\n':
			strcpy (input_str, "\\par");
			return 4;
		case '~':
		case '{':
		case '}':
		case '\\':
		case '_':
		case '-':
			input_str[0] = '\\';
			input_str[1] = ch2;
			input_str[2] = 0;
			return 2;
		case '\'':
			/* Preserve \'## expressions (hex char exprs) for later.
			 * The two following characters are stored as read,
			 * without checking that they are hex digits.
			 */
			input_str[0]='\\';
			input_str[1]='\'';
			ix=2;
			if(ix==current_max_length) {
				if (!expand_word_buffer ())
					error_handler("word too long");
			}
			ch = my_getchar(f);
			input_str[ix++]=ch;
			if(ix==current_max_length) {
				if (!expand_word_buffer ())
					error_handler("word too long");
			}
			ch = my_getchar(f);
			input_str[ix++]=ch;
			if(ix==current_max_length) {
				if (!expand_word_buffer ())
					error_handler("word too long");
			}
			input_str[ix]=0;
			return ix;
		default:
			break;
		}

		/* Anything else after a backslash starts a control word;
		 * ch2 becomes the first character examined by the loop below.
		 */
		is_control_word=TRUE;
		ix=1;
		input_str[0]=ch;
		ch=ch2;
		break;

	case '\t':
		/* In RTF, a tab char is the same as \tab.
		 */
		strcpy (input_str, "\\tab");
		return 4;

	case '{':
	case '}':
	case ';':
		input_str[0]=ch;
		input_str[1]=0;
		return 1;

	default:
		break;
	}

	while (ch!=EOF)
	{
		/* Several chars always ends a word, and we need to save them.
		 */
		if (ch=='\t' || ch=='{' || ch=='}' || ch=='\\') {
			need_unget=TRUE;
			break;
		}

		/* A newline always ends a command word; we don't save it.
		 * A newline is ignored if this is not a command word.
		 */
		if (ch=='\n') {
			if (is_control_word)
				break;
			ch = my_getchar(f);
			continue;
		}

		/* A semicolon always ends a command word; we do save it.
		 * A semicolon never ends a regular word.
		 */
		if (ch==';') {
			if (is_control_word) {
				need_unget=TRUE;
				break;
			}
		}

		/* In this parser, a space character terminates
		 * any word, and if it does not follow a command,
		 * then it is a word in itself.
		 */
		if (ch==' ') {
			if (!is_control_word)
				need_unget=TRUE;
			break;
		}

		/* Identify a control word's numeric parameter.
		 */
		if (is_control_word) {
			/* <ctype.h> functions require a value representable
			 * as unsigned char (or EOF); ch is not EOF here but
			 * may be negative, so convert before classifying.
			 */
			int is_digit = isdigit ((unsigned char) ch);

			if (!has_numeric_param && (is_digit || ch=='-'))
				has_numeric_param = TRUE;
			else
			if (has_numeric_param && !is_digit) {
				/* First non-digit after the parameter ends the
				 * word and belongs to the next one. (A space
				 * cannot reach this point; it was handled
				 * above.)
				 */
				need_unget=TRUE;
				break;
			}
		}

		input_str[ix++] = ch;
		if (ix==current_max_length) {
			if (!expand_word_buffer ())
				error_handler("word too long");
		}
		ch = my_getchar (f);
	}

	if (need_unget)
		my_unget_char(ch);

	input_str[ix]=0;
	return ix;
}
/*========================================================================
 * Name:	discard_word_buffer
 * Purpose:	Releases the buffer allocated by the last read_word() call
 *		and clears input_str so it cannot be released twice.
 * Args:	None.
 * Returns:	None.
 *=======================================================================*/
static void
discard_word_buffer (void)
{
	/* my_free() is never handed NULL here; whether it tolerates NULL
	 * is not visible from this file.
	 */
	if (input_str) {
		my_free (input_str);
		input_str = NULL;
	}
}

/*========================================================================
 * Name:	word_read
 * Purpose:	This is the recursive metareader which pieces together the
 *		structure of Word objects. Words are read until end of
 *		input or a closing brace and chained through their "next"
 *		pointers. An opening brace yields a placeholder Word
 *		(created with word_new(NULL)) whose "child" is the list
 *		read by a recursive call.
 * Args:	Input file.
 * Returns:	Tree of Word objects: the first Word of the list read at
 *		this level, or NULL if no word was read.
 *=======================================================================*/
Word *
word_read (FILE* f) {
	Word * prev_word = NULL;
	Word * first_word = NULL;
	Word * new_word = NULL; /* temp */

	CHECK_PARAM_NOT_NULL(f);

	do {
		if (!read_word(f)) {
			/* End of input. read_word() allocates its buffer
			 * before detecting EOF, so it must still be released.
			 */
			discard_word_buffer ();
			return first_word;
		}

		if (input_str[0] == '{') {
			/* Process subwords. The recursive call makes
			 * read_word() replace input_str, so the buffer
			 * holding "{" has to be released first or it leaks.
			 */
			discard_word_buffer ();

			/* Create a dummy word to point to a sublist */
			new_word = word_new (NULL);
			if (!new_word)
				error_handler ("cannot allocate word");

			/* Get the sublist */
			new_word->child = word_read (f);
		} else if (input_str[0] == '}') {
			/* End of the current group: hand the list back to
			 * the caller that saw the matching "{".
			 */
			discard_word_buffer ();
			return first_word;
		} else {
			new_word = word_new (input_str);

			/* Free up the memory allocated by read_word.
			 */
			discard_word_buffer ();

			if (!new_word)
				error_handler ("cannot allocate word");
		}

		/* Append the new word to the list for this level. */
		if (prev_word) prev_word->next = new_word;
		if (!first_word) first_word = new_word;
		prev_word = new_word;
	}
	while(1);
}