/*
 * kolibrios/contrib/media/updf/pdf/pdf_repair.c
 * (464 lines, 9.1 KiB, C)
 */

#include "fitz.h"
#include "mupdf.h"
/* Scan file for objects and reconstruct xref table */
/* One object found while scanning the file in pdf_repair_xref. */
struct entry
{
	int num, gen;		/* object and generation number preceding the 'obj' keyword */
	int ofs;		/* file offset at which the object number token starts */
	int stm_ofs, stm_len;	/* stream offset (0 if none) and rescanned length (-1 if not rescanned), as set by pdf_repair_obj */
};
/*
 * Scan the body of a single object, starting right after its 'obj' keyword.
 *
 * file      stream positioned just past the 'obj' token
 * buf, cap  scratch buffer handed to the lexer and the dictionary parser
 * stmofsp   receives the offset of the stream data, or 0 if the object
 *           has no stream
 * stmlenp   receives the stream length found by scanning for 'endstream',
 *           or -1 if no scan was needed (no stream, or the declared
 *           /Length already led to an 'endstream' token)
 * encrypt,
 * id        updated (old value dropped, new value kept) when the object is
 *           a dictionary of /Type /XRef carrying /Encrypt or /ID
 *
 * Returns fz_okay, or a thrown/rethrown error if lexing, parsing or
 * reading fails.
 */
static fz_error
pdf_repair_obj(fz_stream *file, char *buf, int cap, int *stmofsp, int *stmlenp, fz_obj **encrypt, fz_obj **id)
{
	fz_error error;
	fz_obj *dict, *val;
	int tok;
	int len;
	int nread;
	int c;
	int declared_len = 0;
	int at_endstream = 0;

	*stmofsp = 0;
	*stmlenp = -1;

	error = pdf_lex(&tok, file, buf, cap, &len);
	if (error)
		return fz_rethrow(error, "cannot parse object");

	if (tok == PDF_TOK_OPEN_DICT)
	{
		/* Send NULL xref so we don't try to resolve references */
		error = pdf_parse_dict(&dict, NULL, file, buf, cap);
		if (error)
			return fz_rethrow(error, "cannot parse object");

		/* a cross-reference stream dictionary doubles as a trailer */
		val = fz_dict_gets(dict, "Type");
		if (fz_is_name(val) && strcmp(fz_to_name(val), "XRef") == 0)
		{
			val = fz_dict_gets(dict, "Encrypt");
			if (val)
			{
				if (*encrypt)
					fz_drop_obj(*encrypt);
				*encrypt = fz_keep_obj(val);
			}

			val = fz_dict_gets(dict, "ID");
			if (val)
			{
				if (*id)
					fz_drop_obj(*id);
				*id = fz_keep_obj(val);
			}
		}

		/* remember the declared stream length as a first guess */
		val = fz_dict_gets(dict, "Length");
		if (fz_is_int(val))
			declared_len = fz_to_int(val);

		fz_drop_obj(dict);
	}

	/* skip tokens until the object ends, a stream starts, or lexing gives up */
	for (;;)
	{
		if (tok == PDF_TOK_STREAM || tok == PDF_TOK_ENDOBJ ||
			tok == PDF_TOK_ERROR || tok == PDF_TOK_EOF)
			break;

		error = pdf_lex(&tok, file, buf, cap, &len);
		if (error)
			return fz_rethrow(error, "cannot scan for endobj or stream token");
	}

	if (tok != PDF_TOK_STREAM)
		return fz_okay;

	/* consume the byte after 'stream'; a CR may be followed by an LF */
	c = fz_read_byte(file);
	if (c == '\r')
	{
		if (fz_peek_byte(file) == '\n')
			fz_read_byte(file);
	}

	*stmofsp = fz_tell(file);
	if (*stmofsp < 0)
		return fz_throw("cannot seek in file");

	/* trust the declared length if an 'endstream' token sits right behind it */
	if (declared_len > 0)
	{
		fz_seek(file, *stmofsp + declared_len, 0);
		error = pdf_lex(&tok, file, buf, cap, &len);
		if (error)
			fz_catch(error, "cannot find endstream token, falling back to scanning");
		if (tok == PDF_TOK_ENDSTREAM)
			at_endstream = 1;
		else
			fz_seek(file, *stmofsp, 0);
	}

	if (!at_endstream)
	{
		/* slide a 9 byte window over the data until it reads "endstream" */
		nread = fz_read(file, (unsigned char *) buf, 9);
		if (nread < 0)
			return fz_rethrow(nread, "cannot read from file");

		while (memcmp(buf, "endstream", 9) != 0)
		{
			c = fz_read_byte(file);
			if (c == EOF)
				break;
			memmove(buf, buf + 1, 8);
			buf[8] = c;
		}

		/* the window itself is not part of the stream data */
		*stmlenp = fz_tell(file) - *stmofsp - 9;
	}

	error = pdf_lex(&tok, file, buf, cap, &len);
	if (error)
		return fz_rethrow(error, "cannot scan for endobj token");
	if (tok != PDF_TOK_ENDOBJ)
		fz_warn("object missing 'endobj' token");

	return fz_okay;
}
/*
 * Register the objects contained in object stream (num gen R) in the
 * xref table.
 *
 * The stream is expected to start with N pairs of integers (N taken from
 * the stream dictionary): an object number followed by a second integer
 * that is only consumed here. For each pair the xref entry of the object
 * number is set to type 'o' with ofs = num (the containing stream) and
 * gen = i (the index of the pair), growing the table when needed.
 *
 * Returns fz_okay on success, or an error if the object cannot be loaded,
 * the stream cannot be opened, or the header does not consist of integers.
 */
static fz_error
pdf_repair_obj_stm(pdf_xref *xref, int num, int gen)
{
	fz_error error;
	fz_obj *obj;
	fz_stream *stm;
	int tok;
	int i, n, count;
	char buf[256];

	error = pdf_load_object(&obj, xref, num, gen);
	if (error)
		return fz_rethrow(error, "cannot load object stream object (%d %d R)", num, gen);

	count = fz_to_int(fz_dict_gets(obj, "N"));

	fz_drop_obj(obj);

	error = pdf_open_stream(&stm, xref, num, gen);
	if (error)
		return fz_rethrow(error, "cannot open object stream object (%d %d R)", num, gen);

	for (i = 0; i < count; i++)
	{
		/* first integer of the pair: the object number */
		error = pdf_lex(&tok, stm, buf, sizeof buf, &n);
		if (error)
		{
			fz_close(stm);
			return fz_rethrow(error, "corrupt object stream (%d %d R)", num, gen);
		}
		if (tok != PDF_TOK_INT)
		{
			/* no underlying error to rethrow here, so raise a new one */
			fz_close(stm);
			return fz_throw("corrupt object stream (%d %d R)", num, gen);
		}

		n = atoi(buf);
		if (n < 0)
		{
			/* a negative number would index before the start of the table */
			fz_warn("ignoring invalid object number %d in object stream (%d %d R)", n, num, gen);
		}
		else
		{
			if (n >= xref->len)
				pdf_resize_xref(xref, n + 1);

			xref->table[n].ofs = num;
			xref->table[n].gen = i;
			xref->table[n].stm_ofs = 0;
			xref->table[n].obj = NULL;
			xref->table[n].type = 'o';
		}

		/* second integer of the pair: consumed but not used */
		error = pdf_lex(&tok, stm, buf, sizeof buf, &n);
		if (error)
		{
			fz_close(stm);
			return fz_rethrow(error, "corrupt object stream (%d %d R)", num, gen);
		}
		if (tok != PDF_TOK_INT)
		{
			fz_close(stm);
			return fz_throw("corrupt object stream (%d %d R)", num, gen);
		}
	}

	fz_close(stm);
	return fz_okay;
}
fz_error
pdf_repair_xref(pdf_xref *xref, char *buf, int bufsize)
{
fz_error error;
fz_obj *dict, *obj;
fz_obj *length;
fz_obj *encrypt = NULL;
fz_obj *id = NULL;
fz_obj *root = NULL;
fz_obj *info = NULL;
struct entry *list = NULL;
int listlen;
int listcap;
int maxnum = 0;
int num = 0;
int gen = 0;
int tmpofs, numofs = 0, genofs = 0;
int stm_len, stm_ofs = 0;
int tok;
int next;
int i, n, c;
fz_seek(xref->file, 0, 0);
listlen = 0;
listcap = 1024;
list = fz_calloc(listcap, sizeof(struct entry));
/* look for '%PDF' version marker within first kilobyte of file */
n = fz_read(xref->file, (unsigned char *)buf, MAX(bufsize, 1024));
if (n < 0)
{
error = fz_rethrow(n, "cannot read from file");
goto cleanup;
}
fz_seek(xref->file, 0, 0);
for (i = 0; i < n - 4; i++)
{
if (memcmp(buf + i, "%PDF", 4) == 0)
{
fz_seek(xref->file, i + 8, 0); /* skip "%PDF-X.Y" */
break;
}
}
/* skip comment line after version marker since some generators
* forget to terminate the comment with a newline */
c = fz_read_byte(xref->file);
while (c >= 0 && (c == ' ' || c == '%'))
c = fz_read_byte(xref->file);
fz_unread_byte(xref->file);
while (1)
{
tmpofs = fz_tell(xref->file);
if (tmpofs < 0)
{
error = fz_throw("cannot tell in file");
goto cleanup;
}
error = pdf_lex(&tok, xref->file, buf, bufsize, &n);
if (error)
{
fz_catch(error, "ignoring the rest of the file");
break;
}
if (tok == PDF_TOK_INT)
{
numofs = genofs;
num = gen;
genofs = tmpofs;
gen = atoi(buf);
}
else if (tok == PDF_TOK_OBJ)
{
error = pdf_repair_obj(xref->file, buf, bufsize, &stm_ofs, &stm_len, &encrypt, &id);
if (error)
{
error = fz_rethrow(error, "cannot parse object (%d %d R)", num, gen);
goto cleanup;
}
if (listlen + 1 == listcap)
{
listcap = (listcap * 3) / 2;
list = fz_realloc(list, listcap, sizeof(struct entry));
}
list[listlen].num = num;
list[listlen].gen = gen;
list[listlen].ofs = numofs;
list[listlen].stm_ofs = stm_ofs;
list[listlen].stm_len = stm_len;
listlen ++;
if (num > maxnum)
maxnum = num;
}
/* trailer dictionary */
else if (tok == PDF_TOK_OPEN_DICT)
{
error = pdf_parse_dict(&dict, xref, xref->file, buf, bufsize);
if (error)
{
error = fz_rethrow(error, "cannot parse object");
goto cleanup;
}
obj = fz_dict_gets(dict, "Encrypt");
if (obj)
{
if (encrypt)
fz_drop_obj(encrypt);
encrypt = fz_keep_obj(obj);
}
obj = fz_dict_gets(dict, "ID");
if (obj)
{
if (id)
fz_drop_obj(id);
id = fz_keep_obj(obj);
}
obj = fz_dict_gets(dict, "Root");
if (obj)
{
if (root)
fz_drop_obj(root);
root = fz_keep_obj(obj);
}
obj = fz_dict_gets(dict, "Info");
if (obj)
{
if (info)
fz_drop_obj(info);
info = fz_keep_obj(obj);
}
fz_drop_obj(dict);
}
else if (tok == PDF_TOK_ERROR)
fz_read_byte(xref->file);
else if (tok == PDF_TOK_EOF)
break;
}
/* make xref reasonable */
pdf_resize_xref(xref, maxnum + 1);
for (i = 0; i < listlen; i++)
{
xref->table[list[i].num].type = 'n';
xref->table[list[i].num].ofs = list[i].ofs;
xref->table[list[i].num].gen = list[i].gen;
xref->table[list[i].num].stm_ofs = list[i].stm_ofs;
/* corrected stream length */
if (list[i].stm_len >= 0)
{
error = pdf_load_object(&dict, xref, list[i].num, list[i].gen);
if (error)
{
error = fz_rethrow(error, "cannot load stream object (%d %d R)", list[i].num, list[i].gen);
goto cleanup;
}
length = fz_new_int(list[i].stm_len);
fz_dict_puts(dict, "Length", length);
fz_drop_obj(length);
fz_drop_obj(dict);
}
}
xref->table[0].type = 'f';
xref->table[0].ofs = 0;
xref->table[0].gen = 65535;
xref->table[0].stm_ofs = 0;
xref->table[0].obj = NULL;
next = 0;
for (i = xref->len - 1; i >= 0; i--)
{
if (xref->table[i].type == 'f')
{
xref->table[i].ofs = next;
if (xref->table[i].gen < 65535)
xref->table[i].gen ++;
next = i;
}
}
/* create a repaired trailer, Root will be added later */
xref->trailer = fz_new_dict(5);
obj = fz_new_int(maxnum + 1);
fz_dict_puts(xref->trailer, "Size", obj);
fz_drop_obj(obj);
if (root)
{
fz_dict_puts(xref->trailer, "Root", root);
fz_drop_obj(root);
}
if (info)
{
fz_dict_puts(xref->trailer, "Info", info);
fz_drop_obj(info);
}
if (encrypt)
{
if (fz_is_indirect(encrypt))
{
/* create new reference with non-NULL xref pointer */
obj = fz_new_indirect(fz_to_num(encrypt), fz_to_gen(encrypt), xref);
fz_drop_obj(encrypt);
encrypt = obj;
}
fz_dict_puts(xref->trailer, "Encrypt", encrypt);
fz_drop_obj(encrypt);
}
if (id)
{
if (fz_is_indirect(id))
{
/* create new reference with non-NULL xref pointer */
obj = fz_new_indirect(fz_to_num(id), fz_to_gen(id), xref);
fz_drop_obj(id);
id = obj;
}
fz_dict_puts(xref->trailer, "ID", id);
fz_drop_obj(id);
}
fz_free(list);
return fz_okay;
cleanup:
if (encrypt) fz_drop_obj(encrypt);
if (id) fz_drop_obj(id);
if (root) fz_drop_obj(root);
if (info) fz_drop_obj(info);
fz_free(list);
return error; /* already rethrown */
}
/*
 * Walk the xref table and, for every entry that has a stream whose
 * dictionary is of /Type /ObjStm, register the objects stored inside it
 * via pdf_repair_obj_stm.
 *
 * This is best effort: an object that cannot be loaded or an object
 * stream that cannot be repaired is reported and skipped, and the
 * function always returns fz_okay.
 */
fz_error
pdf_repair_obj_stms(pdf_xref *xref)
{
	fz_error error;
	fz_obj *dict;
	int i;

	/* xref->len is re-read every iteration since repairing may grow the table */
	for (i = 0; i < xref->len; i++)
	{
		if (!xref->table[i].stm_ofs)
			continue;

		error = pdf_load_object(&dict, xref, i, 0);
		if (error)
		{
			/* dict is not valid here, so it must be neither read nor dropped */
			fz_catch(error, "cannot load object (%d 0 R), skipping", i);
			continue;
		}

		if (!strcmp(fz_to_name(fz_dict_gets(dict, "Type")), "ObjStm"))
		{
			error = pdf_repair_obj_stm(xref, i, 0);
			if (error)
				fz_catch(error, "cannot repair object stream (%d 0 R), skipping", i);
		}

		fz_drop_obj(dict);
	}

	return fz_okay;
}