#include "fitz.h"

typedef unsigned char byte;

/* These C implementations use SWAR (SIMD-within-a-register) techniques. */

#if 0 /* TODO: move into porterduff.c functions */

/*
 * Selects bytes 1 and 3 (bits 8-15 and 24-31) of a 32-bit word.
 * Deliberately has no trailing semicolon so that it can be used anywhere
 * inside an expression, not only as the last token of a statement.
 */
#define MASK 0xFF00FF00

/*
 * Blend a solid colour into a run of 32-bit pixels, weighted by a running
 * coverage value.
 *
 * rgba: colour bytes; rgba[0], rgba[1] and rgba[2] are the colour channels
 *	and rgba[3] is an alpha that scales the coverage.
 * src:	coverage deltas, one byte per pixel. Each delta is added to 'cov'
 *	(byte arithmetic, so it wraps modulo 256) and the byte is then
 *	cleared to 0.
 * cov:	coverage carried into the start of the run.
 * len:	number of pixels to process.
 * dst:	destination pixels, read and written as 32-bit words.
 *
 * When rgba[3] is 0 the function returns at once; src is not cleared and
 * dst is not touched.
 */
static void
path_w4i1o4_32bit(byte *rgba,
	byte * restrict src, byte cov, int len, byte * restrict dst)
{
	/* COLOR * coverage + DST * (256-coverage) = (COLOR - DST)*coverage + DST*256 */
	unsigned int *dst32 = (unsigned int *)(void *)dst;
	int alpha = rgba[3];
	/* The colour is kept as two words, each holding two channels in
	 * separate 16-bit lanes, so one multiply scales two channels. */
	unsigned int rb = rgba[0] | (rgba[2] << 16);
	unsigned int ga = rgba[1] | 0xFF0000; /* byte 3 of the colour is forced to 0xFF */

	if (alpha == 0)
		return;

	if (alpha != 255)
	{
		alpha += alpha>>7; /* alpha is now in the 0...256 range */
		while (len--)
		{
			unsigned int ca, drb, dga, crb, cga;
			/* Accumulate the delta and reset the source byte. */
			cov += *src; *src++ = 0;
			ca = cov + (cov>>7); /* ca is in 0...256 range */
			ca = (ca*alpha)>>8; /* ca is in 0...256 range */
			drb = *dst32++;
			if (ca != 0)
			{
				/* Split the destination into bytes 1,3 (kept in place)
				 * and bytes 0,2 (moved up by 8), i.e. each channel
				 * times 256 in its own 16-bit lane. */
				dga = drb & MASK;
				drb = (drb<<8) & MASK;
				cga = ga - (dga>>8);
				crb = rb - (drb>>8);
				/* dst*256 + (colour - dst)*ca, per lane */
				dga += cga * ca;
				drb += crb * ca;
				/* Keep the high byte of every lane and repack. */
				dga &= MASK;
				drb &= MASK;
				drb = dga | (drb>>8);
				dst32[-1] = drb;
			}
		}
	}
	else
	{
		while (len--)
		{
			unsigned int ca, drb, dga, crb, cga;
			/* Accumulate the delta and reset the source byte. */
			cov += *src; *src++ = 0;
			ca = cov + (cov>>7); /* ca is in 0...256 range */
			drb = *dst32++;
			if (ca == 0)
				continue;
			/* Full coverage is 256 after the expansion above (255 is
			 * what cov == 254 expands to and still needs a blend). */
			if (ca == 256)
			{
				drb = (ga<<8) | rb;
			}
			else
			{
				dga = drb & MASK;
				drb = (drb<<8) & MASK;
				cga = ga - (dga>>8);
				crb = rb - (drb>>8);
				dga += cga * ca;
				drb += crb * ca;
				dga &= MASK;
				drb &= MASK;
				drb = dga | (drb>>8);
			}
			dst32[-1] = drb;
		}
	}
}

/*
 * Blend a solid colour into a rectangle of 32-bit pixels through an 8-bit
 * coverage mask.
 *
 * rgba: colour bytes; rgba[3] is an alpha that scales the mask values.
 * src:	mask, one byte per pixel, srcw bytes from one row to the next.
 * dst:	destination, accessed as 32-bit words, dstw bytes from one row to
 *	the next.
 * w0:	width of the rectangle in pixels.
 * h:	height of the rectangle in rows.
 *
 * Returns without touching dst when rgba[3] is 0.
 */
static void
text_w4i1o4_32bit(byte *rgba,
	byte * restrict src, int srcw,
	byte * restrict dst, int dstw, int w0, int h)
{
	unsigned int *out = (unsigned int *)(void *)dst;
	unsigned int opacity = rgba[3];
	unsigned int color_rb = rgba[0] | (rgba[2] << 16);
	unsigned int color_ga = rgba[1] | 0xFF0000;
	int src_skip, dst_skip;
	int x;

	if (opacity == 0)
		return;

	/* Distance from the end of one row to the start of the next. */
	src_skip = srcw - w0;
	dst_skip = (dstw >> 2) - w0;

	if (opacity == 255)
	{
		/* Opaque colour: the mask value alone is the blend weight. */
		for (; h != 0; h--)
		{
			for (x = w0; x != 0; x--, src++, out++)
			{
				unsigned int weight = *src;
				unsigned int pix = *out;
				unsigned int hi, lo;

				weight += weight >> 7; /* 0...255 -> 0...256 */
				if (weight != 0)
				{
					hi = pix & MASK;
					lo = (pix << 8) & MASK;
					hi += (color_ga - (hi >> 8)) * weight;
					lo += (color_rb - (lo >> 8)) * weight;
					hi &= MASK;
					lo &= MASK;
					*out = hi | (lo >> 8);
				}
			}
			src += src_skip;
			out += dst_skip;
		}
	}
	else
	{
		opacity += opacity >> 7; /* 0...255 -> 0...256 */
		for (; h != 0; h--)
		{
			for (x = w0; x != 0; x--, src++, out++)
			{
				unsigned int weight = *src;
				unsigned int pix = *out;
				unsigned int hi, lo;

				weight += weight >> 7; /* 0...255 -> 0...256 */
				weight = (weight * opacity) >> 8;
				if (weight != 0)
				{
					hi = pix & MASK;
					lo = (pix << 8) & MASK;
					hi += (color_ga - (hi >> 8)) * weight;
					lo += (color_rb - (lo >> 8)) * weight;
					hi &= MASK;
					lo &= MASK;
					*out = hi | (lo >> 8);
				}
			}
			src += src_skip;
			out += dst_skip;
		}
	}
}

/*
 * Bilinearly sample an image whose pixels are read as 32-bit words and
 * blend each sample into a run of 32-bit destination pixels.
 *
 * src:	coverage deltas, one byte per pixel; each is added to 'cov' (byte
 *	arithmetic) and the byte is then cleared to 0.
 * cov:	coverage carried into the start of the run.
 * len:	number of destination pixels.
 * dst:	destination, read and written as 32-bit words.
 * image: source pixmap; image->samples is indexed as image->w words per row.
 * u, v: sample position; u>>16 and v>>16 select the texel and bits 8..15
 *	are used as the interpolation weights.
 * fa, fb: added to u and v after every pixel.
 *
 * FZ_EXPAND and FZ_COMBINE are defined outside this file.
 * NOTE(review): they look like "0..255 -> 0..256" and "multiply two 0..256
 * weights" respectively -- confirm against their definitions.
 */
static void
img_4o4_32bit(byte * restrict src, byte cov, int len, byte * restrict dst,
	fz_pixmap *image, int u, int v, int fa, int fb)
{
	unsigned int *dst32 = (unsigned int *)(void *)dst;
	unsigned int *samples = (unsigned int *)(void *)image->samples;
	int w = image->w;
	/* Largest row index vi may take; vi is clamped to it below. */
	int h = image->h-1;

	while (len--)
	{
		unsigned int a, a1, d, d1;
		int sa;
		/* Accumulate the coverage delta and reset the source byte. */
		cov += *src; *src = 0; src++;
		/* (a,a1) = sampleargb(samples, w, h, u, v, argb); */
		{
			int ui, ui1, vi, vi1, ud, vd;
			unsigned int b, b1, c, c1;
			/* ui1 is the offset to the right-hand neighbour. It is set
			 * to 0 when ui is clamped, so both taps read the same
			 * column. */
			ui1 = 1;
			ui = u >> 16;
			if (ui < 0)
			{
				ui = 0;
				ui1 = 0;
			}
			else if (ui >= w-1)
			{
				ui = w-1;
				ui1 = 0;
			}
			/* vi1 is the offset to the row below (one row is w words).
			 * It is set to 0 when vi is clamped. */
			vi1 = w;
			vi = v >> 16;
			if (vi < 0)
			{
				vi = 0;
				vi1 = 0;
			}
			else if (vi >= h)
			{
				vi = h;
				vi1 = 0;
			}
			/* From here on ui is the word index of the top-left texel. */
			ui += vi*w;
			a = samples[ui];
			b = samples[ui + ui1];
			c = samples[ui + vi1];
			d = samples[ui + ui1 + vi1];
			/* Top 8 bits of the fractional part, passed through FZ_EXPAND. */
			ud = (u>>8) & 0xFF;
			vd = (v>>8) & 0xFF;
			ud = FZ_EXPAND(ud);
			vd = FZ_EXPAND(vd);
			/* Each texel is split into two words: the "1" variable holds
			 * bytes 1 and 3, the plain variable holds bytes 0 and 2.
			 * The left/top operand is kept multiplied by 256 (value in
			 * the high byte of each 16-bit lane); the right/bottom
			 * operand is kept in the low byte of each lane. */
			/* (a,a1) = blend(a,b,ud) */
			a1 = a & MASK;
			a = (a<<8) & MASK;
			b1 = (b>>8) & ~MASK;
			b = b & ~MASK;
			a = ((b -(a >>8)) * ud + a ) & MASK;
			a1 = ((b1-(a1>>8)) * ud + a1) & MASK;
			/* (c,c1) = blend(c,d,ud) */
			c1 = c & MASK;
			c = (c<<8) & MASK;
			d1 = (d>>8) & ~MASK;
			d = d & ~MASK;
			c = ((d -(c >>8)) * ud + c ) & MASK;
			c1 = ((d1-(c1>>8)) * ud + c1) & MASK;
			/* (a,a1) = blend((a,a1),(c,c1),vd) */
			a = (((c >>8)-(a >>8)) * vd + a ) & MASK;
			a1 = (((c1>>8)-(a1>>8)) * vd + a1) & MASK;
		}
		/* Byte 3 of the interpolated sample, combined with the coverage,
		 * becomes the blend weight. */
		sa = (a1>>24);
		sa = FZ_COMBINE(FZ_EXPAND(sa), FZ_EXPAND(cov));
		/* Byte 3 of the value being blended in is forced to 0xFF. */
		a1 |= 0xFF000000;
		/* d and d1 are reused here for the destination pixel, split the
		 * same way as the sample. */
		d = *dst32++;
		d1 = d & MASK;
		d = (d<<8) & MASK;
		/* dst*256 + (sample - dst)*sa in every lane; the high byte of
		 * each lane is kept. No shortcut is taken when sa is 0. */
		a = (((a >>8)-(d >>8)) * sa + d ) & MASK;
		a1 = (((a1>>8)-(d1>>8)) * sa + d1) & MASK;
		dst32[-1] = (a>>8) | a1;
		u += fa;
		v += fb;
	}
}

/*
 * Blend a solid colour into a run of 32-bit pixels, weighted by the running
 * coverage, the colour's alpha and a bilinearly sampled 8-bit image.
 *
 * rgba: colour bytes; rgba[3] is the alpha.
 * src:	coverage deltas, one byte per pixel; each is added to 'cov' (byte
 *	arithmetic) and the byte is then cleared to 0.
 * cov:	coverage carried into the start of the run.
 * len:	number of destination pixels.
 * dst:	destination, read and written as 32-bit words.
 * image: source pixmap; image->samples is indexed as image->w bytes per row.
 * u, v: sample position; u>>16 and v>>16 select the texel and bits 8..15
 *	are used as the interpolation weights.
 * fa, fb: added to u and v after every pixel.
 *
 * Returns immediately (src is not cleared) when the expanded alpha is 0.
 *
 * FZ_EXPAND and FZ_COMBINE are defined outside this file.
 * NOTE(review): the comparisons against 256 below suggest FZ_EXPAND maps
 * 0..255 onto 0..256 and FZ_COMBINE multiplies two such weights -- confirm
 * against their definitions.
 */
static void
img_w4i1o4_32bit(byte *rgba, byte * restrict src, byte cov, int len,
	byte * restrict dst, fz_pixmap *image, int u, int v, int fa, int fb)
{
	byte *samples = image->samples;
	int w = image->w;
	/* Largest row index vi may take; vi is clamped to it below. */
	int h = image->h-1;
	int alpha = FZ_EXPAND(rgba[3]);
	/* The colour is kept as two words, each holding two channels in
	 * separate 16-bit lanes; byte 3 of the colour is forced to 0xFF. */
	unsigned int rb = rgba[0] | (rgba[2] << 16);
	unsigned int ga = rgba[1] | 0xFF0000;
	unsigned int *dst32 = (unsigned int *)(void *)dst;

	if (alpha == 0)
		return;
	if (alpha != 256)
	{
		/* General case: the colour's alpha takes part in the weight. */
		while (len--)
		{
			unsigned int ca, drb, dga, crb, cga;
			unsigned int a, b;
			/* Accumulate the coverage delta and reset the source byte. */
			cov += *src; *src = 0; src++;
			drb = *dst32++;
			ca = FZ_COMBINE(FZ_EXPAND(cov), alpha);
			/* The image is only sampled when coverage*alpha is non-zero. */
			if (ca != 0)
			{
				int ui, ui1, vi, vi1, ud, vd;
				/* a = samplemask(samples, w, h, u, v); */
				/* ui1/vi1 are the offsets to the right-hand and lower
				 * neighbours; each is set to 0 when its coordinate
				 * is clamped. */
				ui1 = 1;
				ui = u >> 16;
				if (ui < 0)
				{
					ui = 0;
					ui1 = 0;
				}
				else if (ui >= w-1)
				{
					ui = w-1;
					ui1 = 0;
				}
				vi1 = w;
				vi = v >> 16;
				if (vi < 0)
				{
					vi = 0;
					vi1 = 0;
				}
				else if (vi >= h)
				{
					vi = h;
					vi1 = 0;
				}
				/* From here on ui is the byte index of the top-left texel. */
				ui += vi*w;
				/* a packs the left column (upper texel in bits 0-7,
				 * lower texel in bits 16-23); b packs the right
				 * column the same way. */
				a = samples[ui];
				b = samples[ui + ui1];
				a |= samples[ui + vi1]<<16;
				b |= samples[ui + ui1 + vi1]<<16;
				ud = (u>>8) & 0xFF;
				vd = (v>>8) & 0xFF;
				ud = FZ_EXPAND(ud);
				vd = FZ_EXPAND(vd);
				/* Horizontal blend of both rows at once; the results
				 * land in bits 8-15 and 24-31. */
				/* a = blend(a,b,ud) */
				a = ((b-a) * ud + (a<<8)) & MASK;
				/* Vertical blend; only bits 8-15 of the result are
				 * used, extracted on the next line. */
				/* a = blend(a,a>>16,vd) */
				a = (((a>>24)-(a>>8)) * vd + a);
				a = (a>>8) & 0xFF;
				ca = FZ_COMBINE(ca, FZ_EXPAND(a));
			}
			if (ca != 0)
			{
				/* Split the destination into bytes 1,3 (in place) and
				 * bytes 0,2 (moved up by 8), then compute
				 * dst*256 + (colour - dst)*ca in every lane and keep
				 * the high byte of each lane. */
				dga = drb & MASK;
				drb = (drb<<8) & MASK;
				cga = ga - (dga>>8);
				crb = rb - (drb>>8);
				dga += cga * ca;
				drb += crb * ca;
				dga &= MASK;
				drb &= MASK;
				drb = dga | (drb>>8);
				dst32[-1] = drb;
			}
			u += fa;
			v += fb;
		}
	}
	else
	{
		/* Expanded alpha is 256: the weight is coverage times the sampled
		 * image value only. */
		while (len--)
		{
			unsigned int ca, drb, dga, crb, cga;
			unsigned int a, b;
			/* Accumulate the coverage delta and reset the source byte. */
			cov += *src; *src = 0; src++;
			drb = *dst32++;
			if (cov != 0)
			{
				int ui, ui1, vi, vi1, ud, vd;
				/* a = samplemask(samples, w, h, u, v); */
				/* Same clamped bilinear fetch as in the branch above. */
				ui1 = 1;
				ui = u >> 16;
				if (ui < 0)
				{
					ui = 0;
					ui1 = 0;
				}
				else if (ui >= w-1)
				{
					ui = w-1;
					ui1 = 0;
				}
				vi1 = w;
				vi = v >> 16;
				if (vi < 0)
				{
					vi = 0;
					vi1 = 0;
				}
				else if (vi >= h)
				{
					vi = h;
					vi1 = 0;
				}
				ui += vi*w;
				a = samples[ui];
				b = samples[ui + ui1];
				a |= samples[ui + vi1]<<16;
				b |= samples[ui + ui1 + vi1]<<16;
				ud = (u>>8) & 0xFF;
				vd = (v>>8) & 0xFF;
				ud = FZ_EXPAND(ud);
				vd = FZ_EXPAND(vd);
				/* a = blend(a,b,ud) */
				a = ((b-a) * ud + (a<<8)) & MASK;
				/* a = blend(a,a>>16,vd) */
				a = (((a>>24)-(a>>8)) * vd + a);
				a = (a>>8) & 0xFF;
				ca = FZ_COMBINE(FZ_EXPAND(cov),FZ_EXPAND(a));
				if (ca != 0)
				{
					if (ca == 256)
					{
						/* Full weight: store the colour directly. */
						drb = (ga<<8) | rb;
					}
					else
					{
						dga = drb & MASK;
						drb = (drb<<8) & MASK;
						cga = ga - (dga>>8);
						crb = rb - (drb>>8);
						dga += cga * ca;
						drb += crb * ca;
						dga &= MASK;
						drb &= MASK;
						drb = dga | (drb>>8);
					}
					dst32[-1] = drb;
				}
			}
			u += fa;
			v += fb;
		}
	}
}

/*
 * Bilinearly sample an 8-bit image along a line and blend the value 255
 * into a run of 8-bit destination pixels, weighted by the sample and the
 * running coverage.
 *
 * src:	coverage deltas, one byte per pixel; each is added to 'cov' (byte
 *	arithmetic) and the byte is then cleared to 0.
 * cov:	coverage carried into the start of the run.
 * len:	number of destination pixels.
 * dst:	destination bytes.
 * image: source pixmap; image->samples is indexed as image->w bytes per row.
 * u, v: sample position; u>>16 and v>>16 select the texel and bits 8..15
 *	are used as the interpolation weights.
 * fa, fb: added to u and v after every pixel.
 */
static void
img_1o1_32bit(byte * restrict src, byte cov, int len, byte * restrict dst,
	fz_pixmap *image, int u, int v, int fa, int fb)
{
	byte *texels = image->samples;
	int stride = image->w;
	int last_col = stride - 1;
	int last_row = image->h - 1;

	for (; len != 0; len--, src++, dst++, u += fa, v += fb)
	{
		unsigned int left, right, m;
		int col, row, dx, dy, fx, fy, at;

		/* Accumulate the coverage delta and reset the source byte. */
		cov += *src;
		*src = 0;
		if (cov == 0)
			continue;

		/* Clamp the column; when clamped both taps read the same column. */
		col = u >> 16;
		dx = 1;
		if (col < 0 || col >= last_col)
		{
			col = (col < 0) ? 0 : last_col;
			dx = 0;
		}

		/* Clamp the row; when clamped both taps read the same row. */
		row = v >> 16;
		dy = stride;
		if (row < 0 || row >= last_row)
		{
			row = (row < 0) ? 0 : last_row;
			dy = 0;
		}

		at = col + row * stride;

		/* Pack each column as upper texel in bits 0-7 and lower texel
		 * in bits 16-23, so one multiply interpolates both rows. */
		left = texels[at];
		right = texels[at + dx];
		left |= texels[at + dy] << 16;
		right |= texels[at + dx + dy] << 16;

		fx = (u >> 8) & 0xFF;
		fy = (v >> 8) & 0xFF;
		fx = FZ_EXPAND(fx);
		fy = FZ_EXPAND(fy);

		/* Horizontal pass: results end up in bits 8-15 and 24-31. */
		m = ((right - left) * fx + (left << 8)) & MASK;
		/* Vertical pass: the answer is read from bits 8-15. */
		m = ((m >> 24) - (m >> 8)) * fy + m;
		m = (m >> 8) & 0xFF;

		m = FZ_COMBINE(FZ_EXPAND(m), FZ_EXPAND(cov));

		if (m == 256)
			dst[0] = 255;
		else if (m != 0)
			dst[0] = FZ_BLEND(255, dst[0], m);
	}
}

#endif

/*
 * Select accelerated rendering routines for the running platform.
 *
 * The portable SWAR routines in this file require int and unsigned int to
 * be 4 bytes wide and a host that fz_is_big_endian() reports as not
 * big-endian. Their installation is currently disabled, so the guarded
 * branch is empty. When HAVE_CPUDEP is defined, fz_accelerate_arch() is
 * called afterwards.
 */
void fz_accelerate(void)
{
	int swar_usable = sizeof(int) == 4
		&& sizeof(unsigned int) == 4
		&& !fz_is_big_endian();

	if (swar_usable)
	{
		/*
		 * Disabled hookups, kept for reference:
		 *	fz_path_w4i1o4 = path_w4i1o4_32bit;
		 *	fz_text_w4i1o4 = text_w4i1o4_32bit;
		 *	fz_img_4o4 = img_4o4_32bit;
		 *	fz_img_w4i1o4 = img_w4i1o4_32bit;
		 *	fz_img_1o1 = img_1o1_32bit;
		 */
	}

#ifdef HAVE_CPUDEP
	fz_accelerate_arch();
#endif
}