464 lines
9.1 KiB
C
464 lines
9.1 KiB
C
|
#include "fitz.h"
|
||
|
#include "mupdf.h"
|
||
|
|
||
|
/* Scan file for objects and reconstruct xref table */
|
||
|
|
||
|
/*
 * One object discovered while scanning the file in pdf_repair_xref.
 * Entries are collected in a list and copied into the xref table once
 * the scan has finished.
 */
struct entry
{
	int num, gen;		/* object number and generation number */
	int ofs;		/* file offset recorded for the object-number token */
	int stm_ofs, stm_len;	/* stream offset / length as reported by pdf_repair_obj */
};
|
||
|
|
||
|
static fz_error
|
||
|
pdf_repair_obj(fz_stream *file, char *buf, int cap, int *stmofsp, int *stmlenp, fz_obj **encrypt, fz_obj **id)
|
||
|
{
|
||
|
fz_error error;
|
||
|
int tok;
|
||
|
int stm_len;
|
||
|
int len;
|
||
|
int n;
|
||
|
|
||
|
*stmofsp = 0;
|
||
|
*stmlenp = -1;
|
||
|
|
||
|
stm_len = 0;
|
||
|
|
||
|
error = pdf_lex(&tok, file, buf, cap, &len);
|
||
|
if (error)
|
||
|
return fz_rethrow(error, "cannot parse object");
|
||
|
if (tok == PDF_TOK_OPEN_DICT)
|
||
|
{
|
||
|
fz_obj *dict, *obj;
|
||
|
|
||
|
/* Send NULL xref so we don't try to resolve references */
|
||
|
error = pdf_parse_dict(&dict, NULL, file, buf, cap);
|
||
|
if (error)
|
||
|
return fz_rethrow(error, "cannot parse object");
|
||
|
|
||
|
obj = fz_dict_gets(dict, "Type");
|
||
|
if (fz_is_name(obj) && !strcmp(fz_to_name(obj), "XRef"))
|
||
|
{
|
||
|
obj = fz_dict_gets(dict, "Encrypt");
|
||
|
if (obj)
|
||
|
{
|
||
|
if (*encrypt)
|
||
|
fz_drop_obj(*encrypt);
|
||
|
*encrypt = fz_keep_obj(obj);
|
||
|
}
|
||
|
|
||
|
obj = fz_dict_gets(dict, "ID");
|
||
|
if (obj)
|
||
|
{
|
||
|
if (*id)
|
||
|
fz_drop_obj(*id);
|
||
|
*id = fz_keep_obj(obj);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
obj = fz_dict_gets(dict, "Length");
|
||
|
if (fz_is_int(obj))
|
||
|
stm_len = fz_to_int(obj);
|
||
|
|
||
|
fz_drop_obj(dict);
|
||
|
}
|
||
|
|
||
|
while ( tok != PDF_TOK_STREAM &&
|
||
|
tok != PDF_TOK_ENDOBJ &&
|
||
|
tok != PDF_TOK_ERROR &&
|
||
|
tok != PDF_TOK_EOF )
|
||
|
{
|
||
|
error = pdf_lex(&tok, file, buf, cap, &len);
|
||
|
if (error)
|
||
|
return fz_rethrow(error, "cannot scan for endobj or stream token");
|
||
|
}
|
||
|
|
||
|
if (tok == PDF_TOK_STREAM)
|
||
|
{
|
||
|
int c = fz_read_byte(file);
|
||
|
if (c == '\r') {
|
||
|
c = fz_peek_byte(file);
|
||
|
if (c == '\n')
|
||
|
fz_read_byte(file);
|
||
|
}
|
||
|
|
||
|
*stmofsp = fz_tell(file);
|
||
|
if (*stmofsp < 0)
|
||
|
return fz_throw("cannot seek in file");
|
||
|
|
||
|
if (stm_len > 0)
|
||
|
{
|
||
|
fz_seek(file, *stmofsp + stm_len, 0);
|
||
|
error = pdf_lex(&tok, file, buf, cap, &len);
|
||
|
if (error)
|
||
|
fz_catch(error, "cannot find endstream token, falling back to scanning");
|
||
|
if (tok == PDF_TOK_ENDSTREAM)
|
||
|
goto atobjend;
|
||
|
fz_seek(file, *stmofsp, 0);
|
||
|
}
|
||
|
|
||
|
n = fz_read(file, (unsigned char *) buf, 9);
|
||
|
if (n < 0)
|
||
|
return fz_rethrow(n, "cannot read from file");
|
||
|
|
||
|
while (memcmp(buf, "endstream", 9) != 0)
|
||
|
{
|
||
|
c = fz_read_byte(file);
|
||
|
if (c == EOF)
|
||
|
break;
|
||
|
memmove(buf, buf + 1, 8);
|
||
|
buf[8] = c;
|
||
|
}
|
||
|
|
||
|
*stmlenp = fz_tell(file) - *stmofsp - 9;
|
||
|
|
||
|
atobjend:
|
||
|
error = pdf_lex(&tok, file, buf, cap, &len);
|
||
|
if (error)
|
||
|
return fz_rethrow(error, "cannot scan for endobj token");
|
||
|
if (tok != PDF_TOK_ENDOBJ)
|
||
|
fz_warn("object missing 'endobj' token");
|
||
|
}
|
||
|
|
||
|
return fz_okay;
|
||
|
}
|
||
|
|
||
|
static fz_error
|
||
|
pdf_repair_obj_stm(pdf_xref *xref, int num, int gen)
|
||
|
{
|
||
|
fz_error error;
|
||
|
fz_obj *obj;
|
||
|
fz_stream *stm;
|
||
|
int tok;
|
||
|
int i, n, count;
|
||
|
char buf[256];
|
||
|
|
||
|
error = pdf_load_object(&obj, xref, num, gen);
|
||
|
if (error)
|
||
|
return fz_rethrow(error, "cannot load object stream object (%d %d R)", num, gen);
|
||
|
|
||
|
count = fz_to_int(fz_dict_gets(obj, "N"));
|
||
|
|
||
|
fz_drop_obj(obj);
|
||
|
|
||
|
error = pdf_open_stream(&stm, xref, num, gen);
|
||
|
if (error)
|
||
|
return fz_rethrow(error, "cannot open object stream object (%d %d R)", num, gen);
|
||
|
|
||
|
for (i = 0; i < count; i++)
|
||
|
{
|
||
|
error = pdf_lex(&tok, stm, buf, sizeof buf, &n);
|
||
|
if (error || tok != PDF_TOK_INT)
|
||
|
{
|
||
|
fz_close(stm);
|
||
|
return fz_rethrow(error, "corrupt object stream (%d %d R)", num, gen);
|
||
|
}
|
||
|
|
||
|
n = atoi(buf);
|
||
|
if (n >= xref->len)
|
||
|
pdf_resize_xref(xref, n + 1);
|
||
|
|
||
|
xref->table[n].ofs = num;
|
||
|
xref->table[n].gen = i;
|
||
|
xref->table[n].stm_ofs = 0;
|
||
|
xref->table[n].obj = NULL;
|
||
|
xref->table[n].type = 'o';
|
||
|
|
||
|
error = pdf_lex(&tok, stm, buf, sizeof buf, &n);
|
||
|
if (error || tok != PDF_TOK_INT)
|
||
|
{
|
||
|
fz_close(stm);
|
||
|
return fz_rethrow(error, "corrupt object stream (%d %d R)", num, gen);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
fz_close(stm);
|
||
|
return fz_okay;
|
||
|
}
|
||
|
|
||
|
fz_error
|
||
|
pdf_repair_xref(pdf_xref *xref, char *buf, int bufsize)
|
||
|
{
|
||
|
fz_error error;
|
||
|
fz_obj *dict, *obj;
|
||
|
fz_obj *length;
|
||
|
|
||
|
fz_obj *encrypt = NULL;
|
||
|
fz_obj *id = NULL;
|
||
|
fz_obj *root = NULL;
|
||
|
fz_obj *info = NULL;
|
||
|
|
||
|
struct entry *list = NULL;
|
||
|
int listlen;
|
||
|
int listcap;
|
||
|
int maxnum = 0;
|
||
|
|
||
|
int num = 0;
|
||
|
int gen = 0;
|
||
|
int tmpofs, numofs = 0, genofs = 0;
|
||
|
int stm_len, stm_ofs = 0;
|
||
|
int tok;
|
||
|
int next;
|
||
|
int i, n, c;
|
||
|
|
||
|
fz_seek(xref->file, 0, 0);
|
||
|
|
||
|
listlen = 0;
|
||
|
listcap = 1024;
|
||
|
list = fz_calloc(listcap, sizeof(struct entry));
|
||
|
|
||
|
/* look for '%PDF' version marker within first kilobyte of file */
|
||
|
n = fz_read(xref->file, (unsigned char *)buf, MAX(bufsize, 1024));
|
||
|
if (n < 0)
|
||
|
{
|
||
|
error = fz_rethrow(n, "cannot read from file");
|
||
|
goto cleanup;
|
||
|
}
|
||
|
|
||
|
fz_seek(xref->file, 0, 0);
|
||
|
for (i = 0; i < n - 4; i++)
|
||
|
{
|
||
|
if (memcmp(buf + i, "%PDF", 4) == 0)
|
||
|
{
|
||
|
fz_seek(xref->file, i + 8, 0); /* skip "%PDF-X.Y" */
|
||
|
break;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* skip comment line after version marker since some generators
|
||
|
* forget to terminate the comment with a newline */
|
||
|
c = fz_read_byte(xref->file);
|
||
|
while (c >= 0 && (c == ' ' || c == '%'))
|
||
|
c = fz_read_byte(xref->file);
|
||
|
fz_unread_byte(xref->file);
|
||
|
|
||
|
while (1)
|
||
|
{
|
||
|
tmpofs = fz_tell(xref->file);
|
||
|
if (tmpofs < 0)
|
||
|
{
|
||
|
error = fz_throw("cannot tell in file");
|
||
|
goto cleanup;
|
||
|
}
|
||
|
|
||
|
error = pdf_lex(&tok, xref->file, buf, bufsize, &n);
|
||
|
if (error)
|
||
|
{
|
||
|
fz_catch(error, "ignoring the rest of the file");
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
if (tok == PDF_TOK_INT)
|
||
|
{
|
||
|
numofs = genofs;
|
||
|
num = gen;
|
||
|
genofs = tmpofs;
|
||
|
gen = atoi(buf);
|
||
|
}
|
||
|
|
||
|
else if (tok == PDF_TOK_OBJ)
|
||
|
{
|
||
|
error = pdf_repair_obj(xref->file, buf, bufsize, &stm_ofs, &stm_len, &encrypt, &id);
|
||
|
if (error)
|
||
|
{
|
||
|
error = fz_rethrow(error, "cannot parse object (%d %d R)", num, gen);
|
||
|
goto cleanup;
|
||
|
}
|
||
|
|
||
|
if (listlen + 1 == listcap)
|
||
|
{
|
||
|
listcap = (listcap * 3) / 2;
|
||
|
list = fz_realloc(list, listcap, sizeof(struct entry));
|
||
|
}
|
||
|
|
||
|
list[listlen].num = num;
|
||
|
list[listlen].gen = gen;
|
||
|
list[listlen].ofs = numofs;
|
||
|
list[listlen].stm_ofs = stm_ofs;
|
||
|
list[listlen].stm_len = stm_len;
|
||
|
listlen ++;
|
||
|
|
||
|
if (num > maxnum)
|
||
|
maxnum = num;
|
||
|
}
|
||
|
|
||
|
/* trailer dictionary */
|
||
|
else if (tok == PDF_TOK_OPEN_DICT)
|
||
|
{
|
||
|
error = pdf_parse_dict(&dict, xref, xref->file, buf, bufsize);
|
||
|
if (error)
|
||
|
{
|
||
|
error = fz_rethrow(error, "cannot parse object");
|
||
|
goto cleanup;
|
||
|
}
|
||
|
|
||
|
obj = fz_dict_gets(dict, "Encrypt");
|
||
|
if (obj)
|
||
|
{
|
||
|
if (encrypt)
|
||
|
fz_drop_obj(encrypt);
|
||
|
encrypt = fz_keep_obj(obj);
|
||
|
}
|
||
|
|
||
|
obj = fz_dict_gets(dict, "ID");
|
||
|
if (obj)
|
||
|
{
|
||
|
if (id)
|
||
|
fz_drop_obj(id);
|
||
|
id = fz_keep_obj(obj);
|
||
|
}
|
||
|
|
||
|
obj = fz_dict_gets(dict, "Root");
|
||
|
if (obj)
|
||
|
{
|
||
|
if (root)
|
||
|
fz_drop_obj(root);
|
||
|
root = fz_keep_obj(obj);
|
||
|
}
|
||
|
|
||
|
obj = fz_dict_gets(dict, "Info");
|
||
|
if (obj)
|
||
|
{
|
||
|
if (info)
|
||
|
fz_drop_obj(info);
|
||
|
info = fz_keep_obj(obj);
|
||
|
}
|
||
|
|
||
|
fz_drop_obj(dict);
|
||
|
}
|
||
|
|
||
|
else if (tok == PDF_TOK_ERROR)
|
||
|
fz_read_byte(xref->file);
|
||
|
|
||
|
else if (tok == PDF_TOK_EOF)
|
||
|
break;
|
||
|
}
|
||
|
|
||
|
/* make xref reasonable */
|
||
|
|
||
|
pdf_resize_xref(xref, maxnum + 1);
|
||
|
|
||
|
for (i = 0; i < listlen; i++)
|
||
|
{
|
||
|
xref->table[list[i].num].type = 'n';
|
||
|
xref->table[list[i].num].ofs = list[i].ofs;
|
||
|
xref->table[list[i].num].gen = list[i].gen;
|
||
|
|
||
|
xref->table[list[i].num].stm_ofs = list[i].stm_ofs;
|
||
|
|
||
|
/* corrected stream length */
|
||
|
if (list[i].stm_len >= 0)
|
||
|
{
|
||
|
error = pdf_load_object(&dict, xref, list[i].num, list[i].gen);
|
||
|
if (error)
|
||
|
{
|
||
|
error = fz_rethrow(error, "cannot load stream object (%d %d R)", list[i].num, list[i].gen);
|
||
|
goto cleanup;
|
||
|
}
|
||
|
|
||
|
length = fz_new_int(list[i].stm_len);
|
||
|
fz_dict_puts(dict, "Length", length);
|
||
|
fz_drop_obj(length);
|
||
|
|
||
|
fz_drop_obj(dict);
|
||
|
}
|
||
|
|
||
|
}
|
||
|
|
||
|
xref->table[0].type = 'f';
|
||
|
xref->table[0].ofs = 0;
|
||
|
xref->table[0].gen = 65535;
|
||
|
xref->table[0].stm_ofs = 0;
|
||
|
xref->table[0].obj = NULL;
|
||
|
|
||
|
next = 0;
|
||
|
for (i = xref->len - 1; i >= 0; i--)
|
||
|
{
|
||
|
if (xref->table[i].type == 'f')
|
||
|
{
|
||
|
xref->table[i].ofs = next;
|
||
|
if (xref->table[i].gen < 65535)
|
||
|
xref->table[i].gen ++;
|
||
|
next = i;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* create a repaired trailer, Root will be added later */
|
||
|
|
||
|
xref->trailer = fz_new_dict(5);
|
||
|
|
||
|
obj = fz_new_int(maxnum + 1);
|
||
|
fz_dict_puts(xref->trailer, "Size", obj);
|
||
|
fz_drop_obj(obj);
|
||
|
|
||
|
if (root)
|
||
|
{
|
||
|
fz_dict_puts(xref->trailer, "Root", root);
|
||
|
fz_drop_obj(root);
|
||
|
}
|
||
|
if (info)
|
||
|
{
|
||
|
fz_dict_puts(xref->trailer, "Info", info);
|
||
|
fz_drop_obj(info);
|
||
|
}
|
||
|
|
||
|
if (encrypt)
|
||
|
{
|
||
|
if (fz_is_indirect(encrypt))
|
||
|
{
|
||
|
/* create new reference with non-NULL xref pointer */
|
||
|
obj = fz_new_indirect(fz_to_num(encrypt), fz_to_gen(encrypt), xref);
|
||
|
fz_drop_obj(encrypt);
|
||
|
encrypt = obj;
|
||
|
}
|
||
|
fz_dict_puts(xref->trailer, "Encrypt", encrypt);
|
||
|
fz_drop_obj(encrypt);
|
||
|
}
|
||
|
|
||
|
if (id)
|
||
|
{
|
||
|
if (fz_is_indirect(id))
|
||
|
{
|
||
|
/* create new reference with non-NULL xref pointer */
|
||
|
obj = fz_new_indirect(fz_to_num(id), fz_to_gen(id), xref);
|
||
|
fz_drop_obj(id);
|
||
|
id = obj;
|
||
|
}
|
||
|
fz_dict_puts(xref->trailer, "ID", id);
|
||
|
fz_drop_obj(id);
|
||
|
}
|
||
|
|
||
|
fz_free(list);
|
||
|
return fz_okay;
|
||
|
|
||
|
cleanup:
|
||
|
if (encrypt) fz_drop_obj(encrypt);
|
||
|
if (id) fz_drop_obj(id);
|
||
|
if (root) fz_drop_obj(root);
|
||
|
if (info) fz_drop_obj(info);
|
||
|
fz_free(list);
|
||
|
return error; /* already rethrown */
|
||
|
}
|
||
|
|
||
|
fz_error
|
||
|
pdf_repair_obj_stms(pdf_xref *xref)
|
||
|
{
|
||
|
fz_obj *dict;
|
||
|
int i;
|
||
|
|
||
|
for (i = 0; i < xref->len; i++)
|
||
|
{
|
||
|
if (xref->table[i].stm_ofs)
|
||
|
{
|
||
|
pdf_load_object(&dict, xref, i, 0);
|
||
|
if (!strcmp(fz_to_name(fz_dict_gets(dict, "Type")), "ObjStm"))
|
||
|
pdf_repair_obj_stm(xref, i, 0);
|
||
|
fz_drop_obj(dict);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return fz_okay;
|
||
|
}
|