/*=============================================================================
   GNU UnRTF, a command-line program to convert RTF documents to other formats.
   Copyright (C) 2000,2001 Zachary Thayer Smith

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

   The author is reachable by electronic mail at tuorfa@yahoo.com.
=============================================================================*/
|
||
|
|
||
|
|
||
|
/*----------------------------------------------------------------------
 * Module name:    parse
 * Author name:    Zach Smith
 * Create date:    01 Sep 00
 * Purpose:        Parsing of the RTF file into a structure of Word objects.
 *----------------------------------------------------------------------
 * Changes:
 * 15 Oct 00, tuorfa@yahoo.com: parse.c created with functions taken from word.c
 * 15 Oct 00, tuorfa@yahoo.com: backslash before newline is now \par
 * 08 Apr 01, tuorfa@yahoo.com: removed limit on word length
 * 03 Aug 01, tuorfa@yahoo.com: added input buffering
 * 19 Sep 01, tuorfa@yahoo.com: cleaned up read_word()
 * 22 Sep 01, tuorfa@yahoo.com: moved word_dump() to word.c
 * 22 Sep 01, tuorfa@yahoo.com: added function-level comment blocks
 *--------------------------------------------------------------------*/
|
||
|
|
||
|
#include <stdio.h>
|
||
|
#include <stdlib.h>
|
||
|
#include <ctype.h>
|
||
|
#include <string.h>
|
||
|
|
||
|
#include "defs.h"
|
||
|
#include "parse.h"
|
||
|
#include "malloc.h"
|
||
|
#include "main.h"
|
||
|
#include "error.h"
|
||
|
#include "word.h"
|
||
|
#include "hash.h"
|
||
|
|
||
|
|
||
|
|
||
|
/* Push-back slots shared by my_unget_char() and my_getchar().
 * ungot_char is handed back first, then ungot_char2, then ungot_char3.
 * A negative value marks a slot as empty.
 */
static int ungot_char = -1;
static int ungot_char2 = -1;
static int ungot_char3 = -1;
|
||
|
|
||
|
|
||
|
|
||
|
/*========================================================================
|
||
|
* Name: my_unget_char
|
||
|
* Purpose: My own unget routine, handling up to 3 ungot characters.
|
||
|
* Args: Character.
|
||
|
* Returns: None.
|
||
|
*=======================================================================*/
|
||
|
|
||
|
static void my_unget_char (int ch)
|
||
|
{
|
||
|
if (ungot_char>=0 && ungot_char2>=0 && ungot_char3>=0)
|
||
|
error_handler ("more than 3 ungot chars");
|
||
|
|
||
|
ungot_char3 = ungot_char2;
|
||
|
ungot_char2 = ungot_char;
|
||
|
ungot_char = ch;
|
||
|
}
|
||
|
|
||
|
|
||
|
/* Most recent character returned by my_getchar(); used there to detect
 * a backslash that is immediately followed by a newline.
 */
static int last_returned_ch = 0;

/* Input buffering state used by my_getchar(). */
#define READ_BUF_LEN 2048
static int buffer_size = 0;	/* size requested for read_buf */
static char *read_buf = NULL;	/* allocated on the first buffered read */
static int read_buf_end = 0;	/* byte count returned by the last fread() */
static int read_buf_index = 0;	/* position of the next byte to hand out */
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
/*========================================================================
|
||
|
* Name: my_getchar
|
||
|
* Purpose: Gets a character: either an ungot one, or a buffered one.
|
||
|
* Args: Input file.
|
||
|
* Returns: Character, or EOF.
|
||
|
*=======================================================================*/
|
||
|
|
||
|
static int my_getchar (FILE* f)
|
||
|
{
|
||
|
int ch;
|
||
|
|
||
|
CHECK_PARAM_NOT_NULL(f);
|
||
|
|
||
|
if (ungot_char>=0) {
|
||
|
ch = ungot_char;
|
||
|
ungot_char=ungot_char2;
|
||
|
ungot_char2=ungot_char3;
|
||
|
ungot_char3=-1;
|
||
|
last_returned_ch = ch;
|
||
|
return ch;
|
||
|
}
|
||
|
do {
|
||
|
if (read_buf_index >= read_buf_end) {
|
||
|
if (!read_buf) {
|
||
|
buffer_size = READ_BUF_LEN;
|
||
|
read_buf = my_malloc (buffer_size);
|
||
|
if (!read_buf) {
|
||
|
buffer_size /= 4;
|
||
|
read_buf = my_malloc (buffer_size);
|
||
|
if (!read_buf)
|
||
|
error_handler ("cannot allocate read buffer");
|
||
|
}
|
||
|
}
|
||
|
read_buf_end = fread (read_buf, 1, buffer_size, f);
|
||
|
read_buf_index = 0;
|
||
|
if (!read_buf_end)
|
||
|
return EOF;
|
||
|
}
|
||
|
ch = read_buf [read_buf_index++];
|
||
|
|
||
|
if (ch=='\n') {
|
||
|
lineno++;
|
||
|
/* Convert \(newline) into \par here */
|
||
|
if (last_returned_ch=='\\') {
|
||
|
my_unget_char (' ');
|
||
|
my_unget_char ('r');
|
||
|
my_unget_char ('a');
|
||
|
ch = 'p';
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
while (ch=='\r' /* || ch=='\n' */ );
|
||
|
|
||
|
if (ch=='\t') ch = ' ';
|
||
|
|
||
|
last_returned_ch = ch;
|
||
|
return ch;
|
||
|
}
|
||
|
|
||
|
|
||
|
/* Word buffer state. read_word() allocates input_str and fills it,
 * expand_word_buffer() grows it, and word_read() frees it.
 * current_max_length is the allocated size of input_str in bytes.
 */
static char *input_str = NULL;
static unsigned long current_max_length = 1;
|
||
|
|
||
|
|
||
|
|
||
|
/*========================================================================
|
||
|
* Name: expand_word_buffer
|
||
|
* Purpose: Expands the buffer used to store an incoming word.
|
||
|
* This allows us to remove the limit on word length.
|
||
|
* Args: None.
|
||
|
* Returns: None.
|
||
|
*=======================================================================*/
|
||
|
|
||
|
static int
|
||
|
expand_word_buffer ()
|
||
|
{
|
||
|
char *new_ptr;
|
||
|
unsigned long old_length;
|
||
|
if (!input_str)
|
||
|
error_handler ("no input buffer allocated");
|
||
|
old_length = current_max_length;
|
||
|
current_max_length *= 2;
|
||
|
new_ptr = my_malloc (current_max_length);
|
||
|
if (!new_ptr)
|
||
|
error_handler ("out of memory while resizing buffer");
|
||
|
|
||
|
memcpy (new_ptr, input_str, old_length);
|
||
|
my_free (input_str);
|
||
|
input_str = new_ptr;
|
||
|
return TRUE;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
/*========================================================================
|
||
|
* Name: read_word
|
||
|
* Purpose: The core of the parser, this reads a word.
|
||
|
* Args: Input file.
|
||
|
* Returns: Number of characters in the word, or zero.
|
||
|
* Note: The word buffer is static and local to this file.
|
||
|
*=======================================================================*/
|
||
|
|
||
|
static int
|
||
|
read_word (FILE *f)
|
||
|
{
|
||
|
int ch, ch2, ix=0;
|
||
|
int have_whitespace=FALSE;
|
||
|
int is_control_word=FALSE;
|
||
|
int has_numeric_param=FALSE; /* if is_control_word==TRUE */
|
||
|
int need_unget=FALSE;
|
||
|
|
||
|
CHECK_PARAM_NOT_NULL(f);
|
||
|
|
||
|
current_max_length = 10; /* XX */
|
||
|
|
||
|
/* Get some storage for a word.
|
||
|
*/
|
||
|
input_str = my_malloc (current_max_length);
|
||
|
if (!input_str)
|
||
|
error_handler("cannot allocate word storage");
|
||
|
|
||
|
do {
|
||
|
ch = my_getchar(f);
|
||
|
}
|
||
|
while (ch=='\n');
|
||
|
|
||
|
if (ch==' ')
|
||
|
{
|
||
|
/* Compress multiple space chars down to one.
|
||
|
*/
|
||
|
while (ch == ' ') {
|
||
|
ch = my_getchar(f);
|
||
|
have_whitespace=TRUE;
|
||
|
}
|
||
|
if (have_whitespace) {
|
||
|
my_unget_char (ch);
|
||
|
input_str[0]=' ';
|
||
|
input_str[1]=0;
|
||
|
return 1;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
switch(ch)
|
||
|
{
|
||
|
case EOF:
|
||
|
return 0;
|
||
|
|
||
|
case '\\':
|
||
|
ch2 = my_getchar(f);
|
||
|
|
||
|
/* Look for two-character command words.
|
||
|
*/
|
||
|
switch (ch2)
|
||
|
{
|
||
|
case '\n':
|
||
|
strcpy (input_str, "\\par");
|
||
|
return 4;
|
||
|
case '~':
|
||
|
case '{':
|
||
|
case '}':
|
||
|
case '\\':
|
||
|
case '_':
|
||
|
case '-':
|
||
|
input_str[0] = '\\';
|
||
|
input_str[1] = ch2;
|
||
|
input_str[2] = 0;
|
||
|
return 2;
|
||
|
case '\'':
|
||
|
/* Preserve \'## expressions (hex char exprs) for later.
|
||
|
*/
|
||
|
input_str[0]='\\';
|
||
|
input_str[1]='\'';
|
||
|
ix=2;
|
||
|
if(ix==current_max_length) {
|
||
|
if (!expand_word_buffer ())
|
||
|
error_handler("word too long");
|
||
|
}
|
||
|
ch = my_getchar(f);
|
||
|
input_str[ix++]=ch;
|
||
|
if(ix==current_max_length) {
|
||
|
if (!expand_word_buffer ())
|
||
|
error_handler("word too long");
|
||
|
}
|
||
|
ch = my_getchar(f);
|
||
|
input_str[ix++]=ch;
|
||
|
if(ix==current_max_length) {
|
||
|
if (!expand_word_buffer ())
|
||
|
error_handler("word too long");
|
||
|
}
|
||
|
input_str[ix]=0;
|
||
|
return ix;
|
||
|
}
|
||
|
|
||
|
is_control_word=TRUE;
|
||
|
ix=1;
|
||
|
input_str[0]=ch;
|
||
|
ch=ch2;
|
||
|
break;
|
||
|
|
||
|
case '\t':
|
||
|
/* In RTF, a tab char is the same as \tab.
|
||
|
*/
|
||
|
strcpy (input_str, "\\tab");
|
||
|
return 4;
|
||
|
|
||
|
case '{':
|
||
|
case '}':
|
||
|
case ';':
|
||
|
input_str[0]=ch;
|
||
|
input_str[1]=0;
|
||
|
return 1;
|
||
|
|
||
|
}
|
||
|
|
||
|
while (ch!=EOF)
|
||
|
{
|
||
|
/* Several chars always ends a word, and we need to save them.
|
||
|
*/
|
||
|
if (ch=='\t' || ch=='{' || ch=='}' || ch=='\\') {
|
||
|
need_unget=TRUE;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
/* A newline always ends a command word; we don't save it.
|
||
|
* A newline is ignored if this is not a command word.
|
||
|
*/
|
||
|
if (ch=='\n') {
|
||
|
if (is_control_word)
|
||
|
break;
|
||
|
ch = my_getchar(f);
|
||
|
continue;
|
||
|
}
|
||
|
|
||
|
/* A semicolon always ends a command word; we do save it.
|
||
|
* A semicolon never ends a regular word.
|
||
|
*/
|
||
|
if (ch==';') {
|
||
|
if (is_control_word) {
|
||
|
need_unget=TRUE;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* In this parser, a space character terminates
|
||
|
* any word, and if it does not follow a command,
|
||
|
* then it is a word in itself.
|
||
|
*/
|
||
|
if (ch==' ') {
|
||
|
if (!is_control_word)
|
||
|
need_unget=TRUE;
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
/* Identify a control word's numeric parameter.
|
||
|
*/
|
||
|
if (is_control_word) {
|
||
|
if (!has_numeric_param && (isdigit(ch) || ch=='-'))
|
||
|
has_numeric_param = TRUE;
|
||
|
else
|
||
|
if (has_numeric_param && !isdigit(ch)) {
|
||
|
if (ch!=' ')
|
||
|
need_unget=TRUE;
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
input_str[ix++] = ch;
|
||
|
if (ix==current_max_length) {
|
||
|
if (!expand_word_buffer ())
|
||
|
error_handler("word too long");
|
||
|
}
|
||
|
ch = my_getchar (f);
|
||
|
}
|
||
|
|
||
|
if (need_unget)
|
||
|
my_unget_char(ch);
|
||
|
|
||
|
input_str[ix]=0;
|
||
|
return ix;
|
||
|
}
|
||
|
|
||
|
|
||
|
|
||
|
/*========================================================================
|
||
|
* Name: word_read
|
||
|
* Purpose: This is the recursive metareader which pieces together the
|
||
|
* structure of Word objects.
|
||
|
* Args: Input file.
|
||
|
* Returns: Tree of Word objects.
|
||
|
*=======================================================================*/
|
||
|
|
||
|
Word *
|
||
|
word_read (FILE* f) {
|
||
|
Word * prev_word = NULL;
|
||
|
Word * first_word = NULL;
|
||
|
Word * new_word = NULL; /* temp */
|
||
|
|
||
|
CHECK_PARAM_NOT_NULL(f);
|
||
|
|
||
|
do {
|
||
|
if (!read_word(f)) {
|
||
|
return first_word;
|
||
|
}
|
||
|
|
||
|
|
||
|
if (input_str[0] == '{') {
|
||
|
/* Process subwords */
|
||
|
|
||
|
#if 0
|
||
|
printf ("processing subword...\n");
|
||
|
#endif
|
||
|
|
||
|
/* Create a dummy word to point to a sublist */
|
||
|
new_word = word_new (NULL);
|
||
|
if (!new_word)
|
||
|
error_handler ("cannot allocate word");
|
||
|
|
||
|
/* Get the sublist */
|
||
|
new_word->child = word_read (f);
|
||
|
if (!new_word->hash_index && !new_word->child)
|
||
|
{
|
||
|
/* printf ("unable to read children!\n"); */
|
||
|
}
|
||
|
|
||
|
} else if (input_str[0] == '}') {
|
||
|
#if 0
|
||
|
printf ("returning from word_read.\n");
|
||
|
#endif
|
||
|
return first_word;
|
||
|
} else {
|
||
|
new_word = word_new (input_str);
|
||
|
}
|
||
|
|
||
|
if (prev_word) prev_word->next = new_word;
|
||
|
|
||
|
if (!first_word) first_word = new_word;
|
||
|
|
||
|
prev_word = new_word;
|
||
|
|
||
|
/* Free up the memory allocated by read_word.
|
||
|
*/
|
||
|
my_free (input_str);
|
||
|
input_str = NULL;
|
||
|
}
|
||
|
while(1);
|
||
|
|
||
|
}
|
||
|
|
||
|
|