/*
 * The CMap data structure here is constructed on the fly by
 * adding simple range-to-range mappings. The data structure is
 * then optimized to contain both range-to-range and range-to-table
 * lookups.
 *
 * Any one-to-many mappings are inserted as one-to-table
 * lookups in the beginning, and are not affected by the optimization
 * stage.
 *
 * There is a special function to add a 256-length range-to-table mapping.
 * The ranges do not have to be added in order.
 *
 * This code could be a lot simpler if we didn't care about wasting memory,
 * or could trust the parser to give us optimal mappings.
 */
|
|
|
|
#include "fitz.h"
|
|
#include "mupdf.h"
|
|
|
|
/*
 * Macros for accessing the combined extent_flags field.
 *
 * The two low bits of extent_flags hold the range flags; the bits above
 * them hold the extent of the range, i.e. (high - low).
 *
 * Every macro argument is parenthesized so that compound expressions
 * (for example a conditional expression) can be passed safely.
 * Note that 'r' is evaluated more than once by the setters.
 */
#define pdf_range_high(r) ((r)->low + ((r)->extent_flags >> 2))
#define pdf_range_flags(r) ((r)->extent_flags & 3)
#define pdf_range_set_high(r, h) \
	((r)->extent_flags = (((r)->extent_flags & 3) | (((h) - (r)->low) << 2)))
#define pdf_range_set_flags(r, f) \
	((r)->extent_flags = (((r)->extent_flags & ~3) | (f)))
|
|
|
|
/*
|
|
* Allocate, destroy and simple parameters.
|
|
*/
|
|
|
|
pdf_cmap *
|
|
pdf_new_cmap(void)
|
|
{
|
|
pdf_cmap *cmap;
|
|
|
|
cmap = fz_malloc(sizeof(pdf_cmap));
|
|
cmap->refs = 1;
|
|
|
|
strcpy(cmap->cmap_name, "");
|
|
strcpy(cmap->usecmap_name, "");
|
|
cmap->usecmap = NULL;
|
|
cmap->wmode = 0;
|
|
cmap->codespace_len = 0;
|
|
|
|
cmap->rlen = 0;
|
|
cmap->rcap = 0;
|
|
cmap->ranges = NULL;
|
|
|
|
cmap->tlen = 0;
|
|
cmap->tcap = 0;
|
|
cmap->table = NULL;
|
|
|
|
return cmap;
|
|
}
|
|
|
|
/*
 * Take an additional reference to a cmap and return it.
 *
 * A cmap whose reference count is negative is left untouched.
 */
pdf_cmap *
pdf_keep_cmap(pdf_cmap *cmap)
{
	if (cmap->refs < 0)
		return cmap;

	cmap->refs += 1;
	return cmap;
}
|
|
|
|
void
|
|
pdf_drop_cmap(pdf_cmap *cmap)
|
|
{
|
|
if (cmap->refs >= 0)
|
|
{
|
|
if (--cmap->refs == 0)
|
|
{
|
|
if (cmap->usecmap)
|
|
pdf_drop_cmap(cmap->usecmap);
|
|
fz_free(cmap->ranges);
|
|
fz_free(cmap->table);
|
|
fz_free(cmap);
|
|
}
|
|
}
|
|
}
|
|
|
|
void
|
|
pdf_set_usecmap(pdf_cmap *cmap, pdf_cmap *usecmap)
|
|
{
|
|
int i;
|
|
|
|
if (cmap->usecmap)
|
|
pdf_drop_cmap(cmap->usecmap);
|
|
cmap->usecmap = pdf_keep_cmap(usecmap);
|
|
|
|
if (cmap->codespace_len == 0)
|
|
{
|
|
cmap->codespace_len = usecmap->codespace_len;
|
|
for (i = 0; i < usecmap->codespace_len; i++)
|
|
cmap->codespace[i] = usecmap->codespace[i];
|
|
}
|
|
}
|
|
|
|
int
|
|
pdf_get_wmode(pdf_cmap *cmap)
|
|
{
|
|
return cmap->wmode;
|
|
}
|
|
|
|
void
|
|
pdf_set_wmode(pdf_cmap *cmap, int wmode)
|
|
{
|
|
cmap->wmode = wmode;
|
|
}
|
|
|
|
void
|
|
pdf_debug_cmap(pdf_cmap *cmap)
|
|
{
|
|
int i, k, n;
|
|
|
|
printf("cmap $%p /%s {\n", (void *) cmap, cmap->cmap_name);
|
|
|
|
if (cmap->usecmap_name[0])
|
|
printf("\tusecmap /%s\n", cmap->usecmap_name);
|
|
if (cmap->usecmap)
|
|
printf("\tusecmap $%p\n", (void *) cmap->usecmap);
|
|
|
|
printf("\twmode %d\n", cmap->wmode);
|
|
|
|
printf("\tcodespaces {\n");
|
|
for (i = 0; i < cmap->codespace_len; i++)
|
|
{
|
|
printf("\t\t<%x> <%x>\n", cmap->codespace[i].low, cmap->codespace[i].high);
|
|
}
|
|
printf("\t}\n");
|
|
|
|
printf("\tranges (%d,%d) {\n", cmap->rlen, cmap->tlen);
|
|
for (i = 0; i < cmap->rlen; i++)
|
|
{
|
|
pdf_range *r = &cmap->ranges[i];
|
|
printf("\t\t<%04x> <%04x> ", r->low, pdf_range_high(r));
|
|
if (pdf_range_flags(r) == PDF_CMAP_TABLE)
|
|
{
|
|
printf("[ ");
|
|
for (k = 0; k < pdf_range_high(r) - r->low + 1; k++)
|
|
printf("%d ", cmap->table[r->offset + k]);
|
|
printf("]\n");
|
|
}
|
|
else if (pdf_range_flags(r) == PDF_CMAP_MULTI)
|
|
{
|
|
printf("< ");
|
|
n = cmap->table[r->offset];
|
|
for (k = 0; k < n; k++)
|
|
printf("%04x ", cmap->table[r->offset + 1 + k]);
|
|
printf(">\n");
|
|
}
|
|
else
|
|
printf("%d\n", r->offset);
|
|
}
|
|
printf("\t}\n}\n");
|
|
}
|
|
|
|
/*
|
|
* Add a codespacerange section.
|
|
* These ranges are used by pdf_decode_cmap to decode
|
|
* multi-byte encoded strings.
|
|
*/
|
|
void
|
|
pdf_add_codespace(pdf_cmap *cmap, int low, int high, int n)
|
|
{
|
|
if (cmap->codespace_len + 1 == nelem(cmap->codespace))
|
|
{
|
|
fz_warn("assert: too many code space ranges");
|
|
return;
|
|
}
|
|
|
|
cmap->codespace[cmap->codespace_len].n = n;
|
|
cmap->codespace[cmap->codespace_len].low = low;
|
|
cmap->codespace[cmap->codespace_len].high = high;
|
|
cmap->codespace_len ++;
|
|
}
|
|
|
|
/*
|
|
* Add an integer to the table.
|
|
*/
|
|
static void
|
|
add_table(pdf_cmap *cmap, int value)
|
|
{
|
|
if (cmap->tlen == USHRT_MAX)
|
|
{
|
|
fz_warn("cmap table is full; ignoring additional entries");
|
|
return;
|
|
}
|
|
if (cmap->tlen + 1 > cmap->tcap)
|
|
{
|
|
cmap->tcap = cmap->tcap > 1 ? (cmap->tcap * 3) / 2 : 256;
|
|
cmap->table = fz_realloc(cmap->table, cmap->tcap, sizeof(unsigned short));
|
|
}
|
|
cmap->table[cmap->tlen++] = value;
|
|
}
|
|
|
|
/*
|
|
* Add a range.
|
|
*/
|
|
static void
|
|
add_range(pdf_cmap *cmap, int low, int high, int flag, int offset)
|
|
{
|
|
/* If the range is too large to be represented, split it */
|
|
if (high - low > 0x3fff)
|
|
{
|
|
add_range(cmap, low, low+0x3fff, flag, offset);
|
|
add_range(cmap, low+0x3fff, high, flag, offset+0x3fff);
|
|
return;
|
|
}
|
|
if (cmap->rlen + 1 > cmap->rcap)
|
|
{
|
|
cmap->rcap = cmap->rcap > 1 ? (cmap->rcap * 3) / 2 : 256;
|
|
cmap->ranges = fz_realloc(cmap->ranges, cmap->rcap, sizeof(pdf_range));
|
|
}
|
|
cmap->ranges[cmap->rlen].low = low;
|
|
pdf_range_set_high(&cmap->ranges[cmap->rlen], high);
|
|
pdf_range_set_flags(&cmap->ranges[cmap->rlen], flag);
|
|
cmap->ranges[cmap->rlen].offset = offset;
|
|
cmap->rlen ++;
|
|
}
|
|
|
|
/*
|
|
* Add a range-to-table mapping.
|
|
*/
|
|
void
|
|
pdf_map_range_to_table(pdf_cmap *cmap, int low, int *table, int len)
|
|
{
|
|
int i;
|
|
int high = low + len;
|
|
int offset = cmap->tlen;
|
|
if (cmap->tlen + len >= USHRT_MAX)
|
|
fz_warn("cannot map range to table; table is full");
|
|
else
|
|
{
|
|
for (i = 0; i < len; i++)
|
|
add_table(cmap, table[i]);
|
|
add_range(cmap, low, high, PDF_CMAP_TABLE, offset);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Add a range of contiguous one-to-one mappings (ie 1..5 maps to 21..25)
|
|
*/
|
|
void
|
|
pdf_map_range_to_range(pdf_cmap *cmap, int low, int high, int offset)
|
|
{
|
|
add_range(cmap, low, high, high - low == 0 ? PDF_CMAP_SINGLE : PDF_CMAP_RANGE, offset);
|
|
}
|
|
|
|
/*
|
|
* Add a single one-to-many mapping.
|
|
*/
|
|
void
|
|
pdf_map_one_to_many(pdf_cmap *cmap, int low, int *values, int len)
|
|
{
|
|
int offset, i;
|
|
|
|
if (len == 1)
|
|
{
|
|
add_range(cmap, low, low, PDF_CMAP_SINGLE, values[0]);
|
|
return;
|
|
}
|
|
|
|
if (len > 8)
|
|
{
|
|
fz_warn("one to many mapping is too large (%d); truncating", len);
|
|
len = 8;
|
|
}
|
|
|
|
if (len == 2 &&
|
|
values[0] >= 0xD800 && values[0] <= 0xDBFF &&
|
|
values[1] >= 0xDC00 && values[1] <= 0xDFFF)
|
|
{
|
|
fz_warn("ignoring surrogate pair mapping in cmap");
|
|
return;
|
|
}
|
|
|
|
if (cmap->tlen + len + 1 >= USHRT_MAX)
|
|
fz_warn("cannot map one to many; table is full");
|
|
else
|
|
{
|
|
offset = cmap->tlen;
|
|
add_table(cmap, len);
|
|
for (i = 0; i < len; i++)
|
|
add_table(cmap, values[i]);
|
|
add_range(cmap, low, low, PDF_CMAP_MULTI, offset);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Sort the input ranges.
|
|
* Merge contiguous input ranges to range-to-range if the output is contiguous.
|
|
* Merge contiguous input ranges to range-to-table if the output is random.
|
|
*/
|
|
|
|
static int cmprange(const void *va, const void *vb)
|
|
{
|
|
return ((const pdf_range*)va)->low - ((const pdf_range*)vb)->low;
|
|
}
|
|
|
|
void
|
|
pdf_sort_cmap(pdf_cmap *cmap)
|
|
{
|
|
pdf_range *a; /* last written range on output */
|
|
pdf_range *b; /* current range examined on input */
|
|
|
|
if (cmap->rlen == 0)
|
|
return;
|
|
|
|
qsort(cmap->ranges, cmap->rlen, sizeof(pdf_range), cmprange);
|
|
|
|
if (cmap->tlen == USHRT_MAX)
|
|
{
|
|
fz_warn("cmap table is full; will not combine ranges");
|
|
return;
|
|
}
|
|
|
|
a = cmap->ranges;
|
|
b = cmap->ranges + 1;
|
|
|
|
while (b < cmap->ranges + cmap->rlen)
|
|
{
|
|
/* ignore one-to-many mappings */
|
|
if (pdf_range_flags(b) == PDF_CMAP_MULTI)
|
|
{
|
|
*(++a) = *b;
|
|
}
|
|
|
|
/* input contiguous */
|
|
else if (pdf_range_high(a) + 1 == b->low)
|
|
{
|
|
/* output contiguous */
|
|
if (pdf_range_high(a) - a->low + a->offset + 1 == b->offset)
|
|
{
|
|
/* SR -> R and SS -> R and RR -> R and RS -> R */
|
|
if ((pdf_range_flags(a) == PDF_CMAP_SINGLE || pdf_range_flags(a) == PDF_CMAP_RANGE) && (pdf_range_high(b) - a->low <= 0x3fff))
|
|
{
|
|
pdf_range_set_flags(a, PDF_CMAP_RANGE);
|
|
pdf_range_set_high(a, pdf_range_high(b));
|
|
}
|
|
|
|
/* LS -> L */
|
|
else if (pdf_range_flags(a) == PDF_CMAP_TABLE && pdf_range_flags(b) == PDF_CMAP_SINGLE && (pdf_range_high(b) - a->low <= 0x3fff))
|
|
{
|
|
pdf_range_set_high(a, pdf_range_high(b));
|
|
add_table(cmap, b->offset);
|
|
}
|
|
|
|
/* LR -> LR */
|
|
else if (pdf_range_flags(a) == PDF_CMAP_TABLE && pdf_range_flags(b) == PDF_CMAP_RANGE)
|
|
{
|
|
*(++a) = *b;
|
|
}
|
|
|
|
/* XX -> XX */
|
|
else
|
|
{
|
|
*(++a) = *b;
|
|
}
|
|
}
|
|
|
|
/* output separated */
|
|
else
|
|
{
|
|
/* SS -> L */
|
|
if (pdf_range_flags(a) == PDF_CMAP_SINGLE && pdf_range_flags(b) == PDF_CMAP_SINGLE)
|
|
{
|
|
pdf_range_set_flags(a, PDF_CMAP_TABLE);
|
|
pdf_range_set_high(a, pdf_range_high(b));
|
|
add_table(cmap, a->offset);
|
|
add_table(cmap, b->offset);
|
|
a->offset = cmap->tlen - 2;
|
|
}
|
|
|
|
/* LS -> L */
|
|
else if (pdf_range_flags(a) == PDF_CMAP_TABLE && pdf_range_flags(b) == PDF_CMAP_SINGLE && (pdf_range_high(b) - a->low <= 0x3fff))
|
|
{
|
|
pdf_range_set_high(a, pdf_range_high(b));
|
|
add_table(cmap, b->offset);
|
|
}
|
|
|
|
/* XX -> XX */
|
|
else
|
|
{
|
|
*(++a) = *b;
|
|
}
|
|
}
|
|
}
|
|
|
|
/* input separated: XX -> XX */
|
|
else
|
|
{
|
|
*(++a) = *b;
|
|
}
|
|
|
|
b ++;
|
|
}
|
|
|
|
cmap->rlen = a - cmap->ranges + 1;
|
|
|
|
fz_flush_warnings();
|
|
}
|
|
|
|
/*
|
|
* Lookup the mapping of a codepoint.
|
|
*/
|
|
int
|
|
pdf_lookup_cmap(pdf_cmap *cmap, int cpt)
|
|
{
|
|
int l = 0;
|
|
int r = cmap->rlen - 1;
|
|
int m;
|
|
|
|
while (l <= r)
|
|
{
|
|
m = (l + r) >> 1;
|
|
if (cpt < cmap->ranges[m].low)
|
|
r = m - 1;
|
|
else if (cpt > pdf_range_high(&cmap->ranges[m]))
|
|
l = m + 1;
|
|
else
|
|
{
|
|
int i = cpt - cmap->ranges[m].low + cmap->ranges[m].offset;
|
|
if (pdf_range_flags(&cmap->ranges[m]) == PDF_CMAP_TABLE)
|
|
return cmap->table[i];
|
|
if (pdf_range_flags(&cmap->ranges[m]) == PDF_CMAP_MULTI)
|
|
return -1; /* should use lookup_cmap_full */
|
|
return i;
|
|
}
|
|
}
|
|
|
|
if (cmap->usecmap)
|
|
return pdf_lookup_cmap(cmap->usecmap, cpt);
|
|
|
|
return -1;
|
|
}
|
|
|
|
int
|
|
pdf_lookup_cmap_full(pdf_cmap *cmap, int cpt, int *out)
|
|
{
|
|
int i, k, n;
|
|
int l = 0;
|
|
int r = cmap->rlen - 1;
|
|
int m;
|
|
|
|
while (l <= r)
|
|
{
|
|
m = (l + r) >> 1;
|
|
if (cpt < cmap->ranges[m].low)
|
|
r = m - 1;
|
|
else if (cpt > pdf_range_high(&cmap->ranges[m]))
|
|
l = m + 1;
|
|
else
|
|
{
|
|
k = cpt - cmap->ranges[m].low + cmap->ranges[m].offset;
|
|
if (pdf_range_flags(&cmap->ranges[m]) == PDF_CMAP_TABLE)
|
|
{
|
|
out[0] = cmap->table[k];
|
|
return 1;
|
|
}
|
|
else if (pdf_range_flags(&cmap->ranges[m]) == PDF_CMAP_MULTI)
|
|
{
|
|
n = cmap->ranges[m].offset;
|
|
for (i = 0; i < cmap->table[n]; i++)
|
|
out[i] = cmap->table[n + i + 1];
|
|
return cmap->table[n];
|
|
}
|
|
else
|
|
{
|
|
out[0] = k;
|
|
return 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (cmap->usecmap)
|
|
return pdf_lookup_cmap_full(cmap->usecmap, cpt, out);
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Use the codespace ranges to extract a codepoint from a
|
|
* multi-byte encoded string.
|
|
*/
|
|
unsigned char *
|
|
pdf_decode_cmap(pdf_cmap *cmap, unsigned char *buf, int *cpt)
|
|
{
|
|
int k, n, c;
|
|
|
|
c = 0;
|
|
for (n = 0; n < 4; n++)
|
|
{
|
|
c = (c << 8) | buf[n];
|
|
for (k = 0; k < cmap->codespace_len; k++)
|
|
{
|
|
if (cmap->codespace[k].n == n + 1)
|
|
{
|
|
if (c >= cmap->codespace[k].low && c <= cmap->codespace[k].high)
|
|
{
|
|
*cpt = c;
|
|
return buf + n + 1;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
*cpt = 0;
|
|
return buf + 1;
|
|
}
|