kolibrios/contrib/network/netsurf/libparserutils/test/cscodec-utf16.c

#include <ctype.h>
#include <stdio.h>
#include <string.h>

/* These two are for htonl / ntohl */
#include <arpa/inet.h>
#include <netinet/in.h>

#include <parserutils/charset/codec.h>

#include "utils/utils.h"

#include "testutils.h"

/**
 * State shared between main(), the per-line callback handle_line() and
 * run_test() while a test file is being processed.
 */
typedef struct line_ctx {
	parserutils_charset_codec *codec;	/**< Codec under test */

	size_t buflen;		/**< Allocated size of buf, in bytes */
	size_t bufused;		/**< Bytes of buf filled for the current testcase */
	uint8_t *buf;		/**< Input bytes fed to the codec */
	size_t explen;		/**< Allocated size of exp, in bytes */
	size_t expused;		/**< Bytes of exp filled for the current testcase */
	uint8_t *exp;		/**< Output bytes the codec is expected to produce */

	bool indata;		/**< Lines currently belong to a "#data" section */
	bool inexp;		/**< Lines currently belong to an "#expected" section */

	parserutils_error exp_ret;	/**< Return code the codec call must yield */

	/** Which conversion run_test() exercises; BOTH decodes then re-encodes */
	enum { ENCODE, DECODE, BOTH } dir;
} line_ctx;

static bool handle_line(const char *data, size_t datalen, void *pw);
static void run_test(line_ctx *ctx);

/**
 * Allocation callback handed to parserutils_charset_codec_create().
 *
 * \param ptr  Block to resize; forwarded unchanged to realloc()
 * \param len  Requested size in bytes
 * \param pw   Client data pointer (unused)
 * \return Whatever realloc(ptr, len) returns
 */
static void *myrealloc(void *ptr, size_t len, void *pw)
{
	UNUSED(pw);

	return realloc(ptr, len);
}

int main(int argc, char **argv)
{
	parserutils_charset_codec *codec;
	line_ctx ctx;

	if (argc != 2) {
		printf("Usage: %s <filename>\n", argv[0]);
		return 1;
	}

	assert(parserutils_charset_codec_create("NATS-SEFI-ADD",
			myrealloc, NULL, &codec) == PARSERUTILS_BADENCODING);

	assert(parserutils_charset_codec_create("UTF-16", myrealloc, NULL,
			&ctx.codec) == PARSERUTILS_OK);

	ctx.buflen = parse_filesize(argv[1]);
	if (ctx.buflen == 0)
		return 1;

	ctx.buf = malloc(ctx.buflen);
	if (ctx.buf == NULL) {
		printf("Failed allocating %u bytes\n", (int) ctx.buflen);
		return 1;
	}

	ctx.exp = malloc(ctx.buflen);
	if (ctx.exp == NULL) {
		printf("Failed allocating %u bytes\n", (int) ctx.buflen);
		free(ctx.buf);
		return 1;
	}
	ctx.explen = ctx.buflen;

	ctx.buf[0] = '\0';
	ctx.exp[0] = '\0';
	ctx.bufused = 0;
	ctx.expused = 0;
	ctx.indata = false;
	ctx.inexp = false;
	ctx.exp_ret = PARSERUTILS_OK;

	assert(parse_testfile(argv[1], handle_line, &ctx) == true);

	/* and run final test */
	if (ctx.bufused > 0 && ctx.buf[ctx.bufused - 1] == '\n')
		ctx.bufused -= 1;

	if (ctx.expused > 0 && ctx.exp[ctx.expused - 1] == '\n')
		ctx.expused -= 1;

	run_test(&ctx);

	free(ctx.buf);

	parserutils_charset_codec_destroy(ctx.codec);

	printf("PASS\n");

	return 0;
}

/**
 * Map a single hexadecimal character to its numeric value.
 *
 * \param hex  A valid hex character: '0'-'9', 'a'-'f' or 'A'-'F'
 * \return Value of the digit, 0 to 15 for valid input
 */
static inline int hex2digit(char hex)
{
	if (hex > '9') {
		/* Setting bit 0x20 folds 'A'..'F' onto 'a'..'f' */
		const int lowered = hex | 0x20;

		return lowered - 'a' + 10;
	}

	return hex - '0';
}

/**
 * Per-line callback for parse_testfile().
 *
 * Lines starting with '#' are directives:
 *  - "#data ..."     starts the input section; the text after "#data "
 *                    selects the direction (decode / encode / anything else
 *                    means both) and then the codec error mode
 *                    (LOOSE / STRICT / anything else means TRANSLIT).
 *  - "#expected ..." starts the expectation section; the remainder of the
 *                    line names the expected return code.
 *  - "#reset"        resets the codec.
 * A directive seen while in the expectation section first runs the
 * testcase accumulated so far.
 *
 * Any other line is payload: "&#xNNNN" units (16 bit, host endian) in the
 * data section, "&#xXXXXYYYY" units (32 bit, stored big endian) in the
 * expectation section. Newlines are copied through as single bytes.
 *
 * \param data     Line contents
 * \param datalen  Length of the line in bytes
 * \param pw       Pointer to the line_ctx
 * \return Always true
 */
bool handle_line(const char *data, size_t datalen, void *pw)
{
	line_ctx *ctx = (line_ctx *) pw;

	if (data[0] == '#') {
		if (ctx->inexp) {
			/* This marks end of testcase, so run it */

			/* Guard the index: either buffer may still be empty */
			if (ctx->bufused > 0 &&
					ctx->buf[ctx->bufused - 1] == '\n')
				ctx->bufused -= 1;

			if (ctx->expused > 0 &&
					ctx->exp[ctx->expused - 1] == '\n')
				ctx->expused -= 1;

			run_test(ctx);

			ctx->buf[0] = '\0';
			ctx->exp[0] = '\0';
			ctx->bufused = 0;
			ctx->expused = 0;
			ctx->exp_ret = PARSERUTILS_OK;
		}

		if (strncasecmp(data+1, "data", 4) == 0) {
			parserutils_charset_codec_optparams params;
			const char *ptr = data + 6;

			ctx->indata = true;
			ctx->inexp = false;

			if (strncasecmp(ptr, "decode", 6) == 0)
				ctx->dir = DECODE;
			else if (strncasecmp(ptr, "encode", 6) == 0)
				ctx->dir = ENCODE;
			else
				ctx->dir = BOTH;

			/* NOTE(review): this fixed skip assumes a six
			 * character direction keyword followed by one
			 * separator; a shorter keyword would misalign the
			 * error mode match below. */
			ptr += 7;

			if (strncasecmp(ptr, "LOOSE", 5) == 0) {
				params.error_mode.mode =
					PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE;
			} else if (strncasecmp(ptr, "STRICT", 6) == 0) {
				params.error_mode.mode =
					PARSERUTILS_CHARSET_CODEC_ERROR_STRICT;
			} else {
				params.error_mode.mode =
					PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT;
			}

			assert(parserutils_charset_codec_setopt(ctx->codec,
				PARSERUTILS_CHARSET_CODEC_ERROR_MODE,
				&params) == PARSERUTILS_OK);
		} else if (strncasecmp(data+1, "expected", 8) == 0) {
			ctx->indata = false;
			ctx->inexp = true;

			ctx->exp_ret = parserutils_error_from_string(data + 10,
					datalen - 10 - 1 /* \n */);
		} else if (strncasecmp(data+1, "reset", 5) == 0) {
			ctx->indata = false;
			ctx->inexp = false;

			parserutils_charset_codec_reset(ctx->codec);
		}
	} else {
		if (ctx->indata) {
			/* Process "&#xNNNN" as 16-bit code units.  */
			while (datalen) {
				uint16_t nCodePoint;

				if (data[0] == '\n') {
					ctx->buf[ctx->bufused++] = *data++;
					--datalen;
					continue;
				}
				/* Cast to unsigned char: passing a negative
				 * char to isxdigit() is undefined. */
				assert(datalen >= sizeof ("&#xNNNN")-1
					&& data[0] == '&' && data[1] == '#'
					&& data[2] == 'x'
					&& isxdigit((unsigned char) data[3])
					&& isxdigit((unsigned char) data[4])
					&& isxdigit((unsigned char) data[5])
					&& isxdigit((unsigned char) data[6]));
				/* UTF-16 code units are stored host endian
				   (unlike the UCS-4 expectation data).  */
				nCodePoint = (uint16_t)
						((hex2digit(data[3]) << 12) |
						(hex2digit(data[4]) <<  8) |
						(hex2digit(data[5]) <<  4) |
						hex2digit(data[6]));
				/* bufused can be odd once a newline byte has
				 * been appended, so copy bytewise rather than
				 * storing through a cast uint16_t pointer. */
				memcpy(ctx->buf + ctx->bufused, &nCodePoint,
						sizeof nCodePoint);
				ctx->bufused += sizeof nCodePoint;
				data += sizeof ("&#xNNNN")-1;
				datalen -= sizeof ("&#xNNNN")-1;
			}
		}
		if (ctx->inexp) {
			/* Process "&#xXXXXYYYY" as 32-bit code units.  */
			while (datalen) {
				uint32_t nCodePoint;

				if (data[0] == '\n') {
					ctx->exp[ctx->expused++] = *data++;
					--datalen;
					continue;
				}
				assert(datalen >= sizeof ("&#xXXXXYYYY")-1
					&& data[0] == '&' && data[1] == '#'
					&& data[2] == 'x'
					&& isxdigit((unsigned char) data[3])
					&& isxdigit((unsigned char) data[4])
					&& isxdigit((unsigned char) data[5])
					&& isxdigit((unsigned char) data[6])
					&& isxdigit((unsigned char) data[7])
					&& isxdigit((unsigned char) data[8])
					&& isxdigit((unsigned char) data[9])
					&& isxdigit((unsigned char) data[10]));
				/* UCS-4 code is always big endian, so convert
				   host endian to big endian.  The digits are
				   widened to uint32_t first: shifting a signed
				   int digit >= 8 left by 28 is undefined.  */
				nCodePoint =
					htonl(((uint32_t) hex2digit(data[3]) << 28)
					| ((uint32_t) hex2digit(data[4]) << 24)
					| ((uint32_t) hex2digit(data[5]) << 20)
					| ((uint32_t) hex2digit(data[6]) << 16)
					| ((uint32_t) hex2digit(data[7]) << 12)
					| ((uint32_t) hex2digit(data[8]) << 8)
					| ((uint32_t) hex2digit(data[9]) << 4)
					| (uint32_t) hex2digit(data[10]));
				/* expused may not be a multiple of four after
				 * a newline byte, so copy bytewise. */
				memcpy(ctx->exp + ctx->expused, &nCodePoint,
						sizeof nCodePoint);
				ctx->expused += sizeof nCodePoint;
				data += sizeof ("&#xXXXXYYYY")-1;
				datalen -= sizeof ("&#xXXXXYYYY")-1;
			}
		}
	}

	return true;
}

/**
 * Run the testcase currently held in the context.
 *
 * Depending on ctx->dir the input buffer is decoded, encoded, or decoded
 * into a scratch buffer and then re-encoded. Each codec call must return
 * ctx->exp_ret. When that is PARSERUTILS_OK the consumed and produced
 * byte counts are checked as well. Finally the produced bytes are dumped
 * next to the expected bytes and compared.
 *
 * Output buffers are sized at four bytes per input byte and live on the
 * heap, so a large test file cannot exhaust the stack.
 *
 * \param ctx  Test context holding codec, input and expectation
 */
void run_test(line_ctx *ctx)
{
	static int testnum;
	const size_t capacity = ctx->bufused * 4;
	size_t destlen = capacity;
	/* Never request zero bytes: malloc(0) may legitimately return NULL */
	uint8_t *dest = malloc(capacity > 0 ? capacity : 1);
	uint8_t *temp = NULL;
	uint8_t *pdest = dest;
	const uint8_t *psrc = ctx->buf;
	size_t srclen = ctx->bufused;
	size_t produced;
	size_t i;

	assert(dest != NULL);

	if (ctx->dir == DECODE) {
		assert(parserutils_charset_codec_decode(ctx->codec,
				&psrc, &srclen,
				&pdest, &destlen) == ctx->exp_ret);
	} else if (ctx->dir == ENCODE) {
		assert(parserutils_charset_codec_encode(ctx->codec,
				&psrc, &srclen,
				&pdest, &destlen) == ctx->exp_ret);
	} else {
		size_t templen = capacity;
		uint8_t *ptemp;
		const uint8_t *ptemp2;
		size_t templen2;

		temp = malloc(capacity > 0 ? capacity : 1);
		assert(temp != NULL);
		ptemp = temp;

		assert(parserutils_charset_codec_decode(ctx->codec,
				&psrc, &srclen,
				&ptemp, &templen) == ctx->exp_ret);
		/* \todo currently there is no way to specify the number of
		   consumed & produced data in case of a deliberate bad input
		   data set.  */
		if (ctx->exp_ret == PARSERUTILS_OK) {
			assert(temp + (capacity - templen) == ptemp);
		}

		/* Feed exactly what the decode step produced back in */
		ptemp2 = temp;
		templen2 = capacity - templen;
		assert(parserutils_charset_codec_encode(ctx->codec,
				&ptemp2, &templen2,
				&pdest, &destlen) == ctx->exp_ret);
		if (ctx->exp_ret == PARSERUTILS_OK) {
			assert(templen2 == 0);
			assert(temp + (capacity - templen) == ptemp2);
		}
	}
	if (ctx->exp_ret == PARSERUTILS_OK) {
		assert(srclen == 0);
		assert(ctx->buf + ctx->bufused == psrc);
		assert(dest + (capacity - destlen) == pdest);
		assert(capacity - destlen == ctx->expused);
	}

	/* Dump only the bytes the codec actually wrote; iterating up to
	 * expused would read uninitialised or out-of-bounds memory whenever
	 * the output is shorter than the expectation. */
	produced = (size_t) (pdest - dest);

	printf("%d: Read '", ++testnum);
	for (i = 0; i < produced; i++) {
		printf("%c%c ", "0123456789abcdef"[(dest[i] >> 4) & 0xf],
				"0123456789abcdef"[dest[i] & 0xf]);
	}
	printf("' Expected '");
	for (i = 0; i < ctx->expused; i++) {
		printf("%c%c ", "0123456789abcdef"[(ctx->exp[i] >> 4) & 0xf],
				"0123456789abcdef"[ctx->exp[i] & 0xf]);
	}
	printf("'\n");

	assert(pdest == dest + ctx->expused);
	assert(memcmp(dest, ctx->exp, ctx->expused) == 0);

	free(temp);
	free(dest);
}
Netsurf initial port (still needs native ui and cURL) git-svn-id: svn://kolibrios.org@3584 a494cfbc-eb01-0410-851d-a64ba20cac60 2013-06-01 19:14:14 +02:00			`#include <ctype.h>`
			`#include <stdio.h>`
			`#include <string.h>`

			`/* These two are for htonl / ntohl */`
			`#include <arpa/inet.h>`
			`#include <netinet/in.h>`

			`#include <parserutils/charset/codec.h>`

			`#include "utils/utils.h"`

			`#include "testutils.h"`

			`typedef struct line_ctx {`
			`parserutils_charset_codec *codec;`

			`size_t buflen;`
			`size_t bufused;`
			`uint8_t *buf;`
			`size_t explen;`
			`size_t expused;`
			`uint8_t *exp;`

			`bool indata;`
			`bool inexp;`

			`parserutils_error exp_ret;`

			`enum { ENCODE, DECODE, BOTH } dir;`
			`} line_ctx;`

			`static bool handle_line(const char data, size_t datalen, void pw);`
			`static void run_test(line_ctx *ctx);`

			`static void myrealloc(void ptr, size_t len, void *pw)`
			`{`
			`UNUSED(pw);`

			`return realloc(ptr, len);`
			`}`

			`int main(int argc, char **argv)`
			`{`
			`parserutils_charset_codec *codec;`
			`line_ctx ctx;`

			`if (argc != 2) {`
			`printf("Usage: %s <filename>\n", argv[0]);`
			`return 1;`
			`}`

			`assert(parserutils_charset_codec_create("NATS-SEFI-ADD",`
			`myrealloc, NULL, &codec) == PARSERUTILS_BADENCODING);`

			`assert(parserutils_charset_codec_create("UTF-16", myrealloc, NULL,`
			`&ctx.codec) == PARSERUTILS_OK);`

			`ctx.buflen = parse_filesize(argv[1]);`
			`if (ctx.buflen == 0)`
			`return 1;`

			`ctx.buf = malloc(ctx.buflen);`
			`if (ctx.buf == NULL) {`
			`printf("Failed allocating %u bytes\n", (int) ctx.buflen);`
			`return 1;`
			`}`

			`ctx.exp = malloc(ctx.buflen);`
			`if (ctx.exp == NULL) {`
			`printf("Failed allocating %u bytes\n", (int) ctx.buflen);`
			`free(ctx.buf);`
			`return 1;`
			`}`
			`ctx.explen = ctx.buflen;`

			`ctx.buf[0] = '\0';`
			`ctx.exp[0] = '\0';`
			`ctx.bufused = 0;`
			`ctx.expused = 0;`
			`ctx.indata = false;`
			`ctx.inexp = false;`
			`ctx.exp_ret = PARSERUTILS_OK;`

			`assert(parse_testfile(argv[1], handle_line, &ctx) == true);`

			`/* and run final test */`
			`if (ctx.bufused > 0 && ctx.buf[ctx.bufused - 1] == '\n')`
			`ctx.bufused -= 1;`

			`if (ctx.expused > 0 && ctx.exp[ctx.expused - 1] == '\n')`
			`ctx.expused -= 1;`

			`run_test(&ctx);`

			`free(ctx.buf);`

			`parserutils_charset_codec_destroy(ctx.codec);`

			`printf("PASS\n");`

			`return 0;`
			`}`

			`/**`
			`* Converts hex character ('0' ... '9' or 'a' ... 'f' or 'A' ... 'F') to`
			`* digit value.`
			`* \param hex Valid hex character`
			`* \return Corresponding digit value.`
			`*/`
			`static inline int hex2digit(char hex)`
			`{`
			`return (hex <= '9') ? hex - '0' : (hex \| 0x20) - 'a' + 10;`
			`}`

			`bool handle_line(const char data, size_t datalen, void pw)`
			`{`
			`line_ctx ctx = (line_ctx ) pw;`

			`if (data[0] == '#') {`
			`if (ctx->inexp) {`
			`/* This marks end of testcase, so run it */`

			`if (ctx->buf[ctx->bufused - 1] == '\n')`
			`ctx->bufused -= 1;`

			`if (ctx->exp[ctx->expused - 1] == '\n')`
			`ctx->expused -= 1;`

			`run_test(ctx);`

			`ctx->buf[0] = '\0';`
			`ctx->exp[0] = '\0';`
			`ctx->bufused = 0;`
			`ctx->expused = 0;`
			`ctx->exp_ret = PARSERUTILS_OK;`
			`}`

			`if (strncasecmp(data+1, "data", 4) == 0) {`
			`parserutils_charset_codec_optparams params;`
			`const char *ptr = data + 6;`

			`ctx->indata = true;`
			`ctx->inexp = false;`

			`if (strncasecmp(ptr, "decode", 6) == 0)`
			`ctx->dir = DECODE;`
			`else if (strncasecmp(ptr, "encode", 6) == 0)`
			`ctx->dir = ENCODE;`
			`else`
			`ctx->dir = BOTH;`

			`ptr += 7;`

			`if (strncasecmp(ptr, "LOOSE", 5) == 0) {`
			`params.error_mode.mode =`
			`PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE;`
			`ptr += 6;`
			`} else if (strncasecmp(ptr, "STRICT", 6) == 0) {`
			`params.error_mode.mode =`
			`PARSERUTILS_CHARSET_CODEC_ERROR_STRICT;`
			`ptr += 7;`
			`} else {`
			`params.error_mode.mode =`
			`PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT;`
			`ptr += 9;`
			`}`

			`assert(parserutils_charset_codec_setopt(ctx->codec,`
			`PARSERUTILS_CHARSET_CODEC_ERROR_MODE,`
			`(parserutils_charset_codec_optparams *) &params)`
			`== PARSERUTILS_OK);`
			`} else if (strncasecmp(data+1, "expected", 8) == 0) {`
			`ctx->indata = false;`
			`ctx->inexp = true;`

			`ctx->exp_ret = parserutils_error_from_string(data + 10,`
			`datalen - 10 - 1 /* \n */);`
			`} else if (strncasecmp(data+1, "reset", 5) == 0) {`
			`ctx->indata = false;`
			`ctx->inexp = false;`

			`parserutils_charset_codec_reset(ctx->codec);`
			`}`
			`} else {`
			`if (ctx->indata) {`
			`/* Process "&#xNNNN" as 16-bit code units. */`
			`while (datalen) {`
			`uint16_t nCodePoint;`

			`if (data[0] == '\n') {`
			`ctx->buf[ctx->bufused++] = *data++;`
			`--datalen;`
			`continue;`
			`}`
			`assert(datalen >= sizeof ("&#xNNNN")-1 \`
			`&& data[0] == '&' && data[1] == '#' \`
			`&& data[2] == 'x' && isxdigit(data[3]) \`
			`&& isxdigit(data[4]) && isxdigit(data[5]) \`
			`&& isxdigit(data[6]));`
			`/* UTF-16 code is always host endian (different`
			`than UCS-32 !). */`
			`nCodePoint = (hex2digit(data[3]) << 12) \|`
			`(hex2digit(data[4]) << 8) \|`
			`(hex2digit(data[5]) << 4) \|`
			`hex2digit(data[6]);`
			`((uint16_t ) (void *) (ctx->buf + ctx->bufused)) =`
			`nCodePoint;`
			`ctx->bufused += 2;`
			`data += sizeof ("&#xNNNN")-1;`
			`datalen -= sizeof ("&#xNNNN")-1;`
			`}`
			`}`
			`if (ctx->inexp) {`
			`/* Process "&#xXXXXYYYY as 32-bit code units. */`
			`while (datalen) {`
			`uint32_t nCodePoint;`

			`if (data[0] == '\n') {`
			`ctx->exp[ctx->expused++] = *data++;`
			`--datalen;`
			`continue;`
			`}`
			`assert(datalen >= sizeof ("&#xXXXXYYYY")-1 \`
			`&& data[0] == '&' && data[1] == '#' \`
			`&& data[2] == 'x' && isxdigit(data[3]) \`
			`&& isxdigit(data[4]) && isxdigit(data[5]) \`
			`&& isxdigit(data[6]) && isxdigit(data[7]) \`
			`&& isxdigit(data[8]) && isxdigit(data[9]) \`
			`&& isxdigit(data[10]));`
			`/* UCS-4 code is always big endian, so convert`
			`host endian to big endian. */`
			`nCodePoint =`
			`htonl((hex2digit(data[3]) << 28)`
			`\| (hex2digit(data[4]) << 24)`
			`\| (hex2digit(data[5]) << 20)`
			`\| (hex2digit(data[6]) << 16)`
			`\| (hex2digit(data[7]) << 12)`
			`\| (hex2digit(data[8]) << 8)`
			`\| (hex2digit(data[9]) << 4)`
			`\| hex2digit(data[10]));`
			`((uint32_t ) (void *) (ctx->exp + ctx->expused)) =`
			`nCodePoint;`
			`ctx->expused += 4;`
			`data += sizeof ("&#xXXXXYYYY")-1;`
			`datalen -= sizeof ("&#xXXXXYYYY")-1;`
			`}`
			`}`
			`}`

			`return true;`
			`}`

			`void run_test(line_ctx *ctx)`
			`{`
			`static int testnum;`
			`size_t destlen = ctx->bufused * 4;`
			`uint8_t *dest = alloca(destlen);`
			`uint8_t *pdest = dest;`
			`const uint8_t *psrc = ctx->buf;`
			`size_t srclen = ctx->bufused;`
			`size_t i;`

			`if (ctx->dir == DECODE) {`
			`assert(parserutils_charset_codec_decode(ctx->codec,`
			`&psrc, &srclen,`
			`&pdest, &destlen) == ctx->exp_ret);`
			`} else if (ctx->dir == ENCODE) {`
			`assert(parserutils_charset_codec_encode(ctx->codec,`
			`&psrc, &srclen,`
			`&pdest, &destlen) == ctx->exp_ret);`
			`} else {`
			`size_t templen = ctx->bufused * 4;`
			`uint8_t *temp = alloca(templen);`
			`uint8_t *ptemp = temp;`
			`const uint8_t *ptemp2;`
			`size_t templen2;`

			`assert(parserutils_charset_codec_decode(ctx->codec,`
			`&psrc, &srclen,`
			`&ptemp, &templen) == ctx->exp_ret);`
			`/* \todo currently there is no way to specify the number of`
			`consumed & produced data in case of a deliberate bad input`
			`data set. */`
			`if (ctx->exp_ret == PARSERUTILS_OK) {`
			`assert(temp + (ctx->bufused * 4 - templen) == ptemp);`
			`}`

			`ptemp2 = temp;`
			`templen2 = ctx->bufused * 4 - templen;`
			`assert(parserutils_charset_codec_encode(ctx->codec,`
			`&ptemp2, &templen2,`
			`&pdest, &destlen) == ctx->exp_ret);`
			`if (ctx->exp_ret == PARSERUTILS_OK) {`
			`assert(templen2 == 0);`
			`assert(temp + (ctx->bufused * 4 - templen) == ptemp2);`
			`}`
			`}`
			`if (ctx->exp_ret == PARSERUTILS_OK) {`
			`assert(srclen == 0);`
			`assert(ctx->buf + ctx->bufused == psrc);`
			`assert(dest + (ctx->bufused * 4 - destlen) == pdest);`
			`assert(ctx->bufused * 4 - destlen == ctx->expused);`
			`}`

			`printf("%d: Read '", ++testnum);`
			`for (i = 0; i < ctx->expused; i++) {`
			`printf("%c%c ", "0123456789abcdef"[(dest[i] >> 4) & 0xf],`
			`"0123456789abcdef"[dest[i] & 0xf]);`
			`}`
			`printf("' Expected '");`
			`for (i = 0; i < ctx->expused; i++) {`
			`printf("%c%c ", "0123456789abcdef"[(ctx->exp[i] >> 4) & 0xf],`
			`"0123456789abcdef"[ctx->exp[i] & 0xf]);`
			`}`
			`printf("'\n");`

			`assert(pdest == dest + ctx->expused);`
			`assert(memcmp(dest, ctx->exp, ctx->expused) == 0);`
			`}`